Index: include/llvm/IR/IntrinsicsX86.td
===================================================================
--- include/llvm/IR/IntrinsicsX86.td
+++ include/llvm/IR/IntrinsicsX86.td
@@ -372,28 +372,16 @@
 
 // Integer arithmetic ops.
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_sse2_padds_b : GCCBuiltin<"__builtin_ia32_paddsb128">,
+  def int_x86_sse2_padds_b : // FIXME: remove this intrinsic
               Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty,
                          llvm_v16i8_ty], [IntrNoMem, Commutative]>;
-  def int_x86_sse2_padds_w : GCCBuiltin<"__builtin_ia32_paddsw128">,
+  def int_x86_sse2_padds_w : // FIXME: remove this intrinsic
               Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty,
                          llvm_v8i16_ty], [IntrNoMem, Commutative]>;
-  def int_x86_sse2_paddus_b : GCCBuiltin<"__builtin_ia32_paddusb128">,
-              Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty,
-                         llvm_v16i8_ty], [IntrNoMem, Commutative]>;
-  def int_x86_sse2_paddus_w : GCCBuiltin<"__builtin_ia32_paddusw128">,
-              Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty,
-                         llvm_v8i16_ty], [IntrNoMem, Commutative]>;
-  def int_x86_sse2_psubs_b : GCCBuiltin<"__builtin_ia32_psubsb128">,
-              Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty,
-                         llvm_v16i8_ty], [IntrNoMem]>;
-  def int_x86_sse2_psubs_w : GCCBuiltin<"__builtin_ia32_psubsw128">,
-              Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty,
-                         llvm_v8i16_ty], [IntrNoMem]>;
-  def int_x86_sse2_psubus_b : GCCBuiltin<"__builtin_ia32_psubusb128">,
+  def int_x86_sse2_psubs_b : // FIXME: remove this intrinsic
               Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty,
                          llvm_v16i8_ty], [IntrNoMem]>;
-  def int_x86_sse2_psubus_w : GCCBuiltin<"__builtin_ia32_psubusw128">,
+  def int_x86_sse2_psubs_w : // FIXME: remove this intrinsic
               Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty,
                          llvm_v8i16_ty], [IntrNoMem]>;
   def int_x86_sse2_pmulhu_w : GCCBuiltin<"__builtin_ia32_pmulhuw128">,
@@ -1377,28 +1365,16 @@
 
 // Integer arithmetic ops.
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_avx2_padds_b : GCCBuiltin<"__builtin_ia32_paddsb256">,
-              Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty,
-                         llvm_v32i8_ty], [IntrNoMem, Commutative]>;
-  def int_x86_avx2_padds_w : GCCBuiltin<"__builtin_ia32_paddsw256">,
-              Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty,
-                         llvm_v16i16_ty], [IntrNoMem, Commutative]>;
-  def int_x86_avx2_paddus_b : GCCBuiltin<"__builtin_ia32_paddusb256">,
+  def int_x86_avx2_padds_b : // FIXME: remove this intrinsic
               Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty,
                          llvm_v32i8_ty], [IntrNoMem, Commutative]>;
-  def int_x86_avx2_paddus_w : GCCBuiltin<"__builtin_ia32_paddusw256">,
+  def int_x86_avx2_padds_w : // FIXME: remove this intrinsic
               Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty,
                          llvm_v16i16_ty], [IntrNoMem, Commutative]>;
-  def int_x86_avx2_psubs_b : GCCBuiltin<"__builtin_ia32_psubsb256">,
-              Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty,
-                         llvm_v32i8_ty], [IntrNoMem]>;
-  def int_x86_avx2_psubs_w : GCCBuiltin<"__builtin_ia32_psubsw256">,
-              Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty,
-                         llvm_v16i16_ty], [IntrNoMem]>;
-  def int_x86_avx2_psubus_b : GCCBuiltin<"__builtin_ia32_psubusb256">,
+  def int_x86_avx2_psubs_b : // FIXME: remove this intrinsic
               Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty,
                          llvm_v32i8_ty], [IntrNoMem]>;
-  def int_x86_avx2_psubus_w : GCCBuiltin<"__builtin_ia32_psubusw256">,
+  def int_x86_avx2_psubs_w : // FIXME: remove this intrinsic
               Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty,
                          llvm_v16i16_ty], [IntrNoMem]>;
   def int_x86_avx2_pmulhu_w : GCCBuiltin<"__builtin_ia32_pmulhuw256">,
@@ -4242,7 +4218,7 @@
   def int_x86_avx512_mask_padds_b_256 : // FIXME: remove this intrinsic
           Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, llvm_v32i8_ty,
                      llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_padds_b_512 : GCCBuiltin<"__builtin_ia32_paddsb512_mask">,
+  def int_x86_avx512_mask_padds_b_512 : // FIXME: remove this intrinsic
           Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty, llvm_v64i8_ty,
                      llvm_v64i8_ty, llvm_i64_ty], [IntrNoMem]>;
   def int_x86_avx512_mask_padds_w_128 : // FIXME: remove this intrinsic
@@ -4251,25 +4227,7 @@
   def int_x86_avx512_mask_padds_w_256 : // FIXME: remove this intrinsic
           Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty,
                      llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_padds_w_512 : GCCBuiltin<"__builtin_ia32_paddsw512_mask">,
-          Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty,
-                     llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_paddus_b_128 : // FIXME: remove this intrinsic
-          Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty,
-                     llvm_v16i8_ty, llvm_i16_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_paddus_b_256 : // FIXME: remove this intrinsic
-          Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, llvm_v32i8_ty,
-                     llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_paddus_b_512 : GCCBuiltin<"__builtin_ia32_paddusb512_mask">,
-          Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty, llvm_v64i8_ty,
-                     llvm_v64i8_ty, llvm_i64_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_paddus_w_128 : // FIXME: remove this intrinsic
-          Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty,
-                     llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_paddus_w_256 : // FIXME: remove this intrinsic
-          Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty,
-                     llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_paddus_w_512 : GCCBuiltin<"__builtin_ia32_paddusw512_mask">,
+  def int_x86_avx512_mask_padds_w_512 : // FIXME: remove this intrinsic
           Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty,
                      llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>;
   def int_x86_avx512_mask_psubs_b_128 : // FIXME: remove this intrinsic
@@ -4278,7 +4236,7 @@
   def int_x86_avx512_mask_psubs_b_256 : // FIXME: remove this intrinsic
           Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, llvm_v32i8_ty,
                      llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_psubs_b_512 : GCCBuiltin<"__builtin_ia32_psubsb512_mask">,
+  def int_x86_avx512_mask_psubs_b_512 : // FIXME: remove this intrinsic
           Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty, llvm_v64i8_ty,
                      llvm_v64i8_ty, llvm_i64_ty], [IntrNoMem]>;
   def int_x86_avx512_mask_psubs_w_128 : // FIXME: remove this intrinsic
@@ -4287,25 +4245,7 @@
   def int_x86_avx512_mask_psubs_w_256 : // FIXME: remove this intrinsic
           Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty,
                      llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_psubs_w_512 : GCCBuiltin<"__builtin_ia32_psubsw512_mask">,
-          Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty,
-                     llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_psubus_b_128 : // FIXME: remove this intrinsic
-          Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty,
-                     llvm_v16i8_ty, llvm_i16_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_psubus_b_256 : // FIXME: remove this intrinsic
-          Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, llvm_v32i8_ty,
-                     llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_psubus_b_512 : GCCBuiltin<"__builtin_ia32_psubusb512_mask">,
-          Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty, llvm_v64i8_ty,
-                     llvm_v64i8_ty, llvm_i64_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_psubus_w_128 : // FIXME: remove this intrinsic
-          Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty,
-                     llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_psubus_w_256 : // FIXME: remove this intrinsic
-          Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty,
-                     llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_psubus_w_512 : GCCBuiltin<"__builtin_ia32_psubusw512_mask">,
+  def int_x86_avx512_mask_psubs_w_512 : // FIXME: remove this intrinsic
           Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty,
                      llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>;
   def int_x86_avx512_pmulhu_w_512 : GCCBuiltin<"__builtin_ia32_pmulhuw512">,
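Note (reviewer illustration, not part of the patch): with these paddus/psubus definitions gone, old bitcode that still calls them is rewritten by the AutoUpgrade changes below into plain IR, and the NOTE in the new canonical tests indicates front ends are expected to emit that same idiom directly. A minimal sketch of the unsigned saturating add in generic IR, mirroring what UpgradeX86AddSubSatIntrinsics produces (the function name @paddusb_canonical is made up for illustration):

define <16 x i8> @paddusb_canonical(<16 x i8> %a, <16 x i8> %b) {
  ; If %a is greater than %a + %b, the addition wrapped, so clamp to all-ones.
  %sum = add <16 x i8> %a, %b
  %ov = icmp ugt <16 x i8> %a, %sum
  %res = select <16 x i1> %ov, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <16 x i8> %sum
  ret <16 x i8> %res
}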
Index: lib/IR/AutoUpgrade.cpp
===================================================================
--- lib/IR/AutoUpgrade.cpp
+++ lib/IR/AutoUpgrade.cpp
@@ -84,7 +84,13 @@
   // like to use this information to remove upgrade code for some older
   // intrinsics. It is currently undecided how we will determine that future
   // point.
-  if (Name=="ssse3.pabs.b.128" || // Added in 6.0
+  if (Name.startswith("sse2.paddus") || // Added in 7.0
+      Name.startswith("sse2.psubus") || // Added in 7.0
+      Name.startswith("avx2.paddus") || // Added in 7.0
+      Name.startswith("avx2.psubus") || // Added in 7.0
+      Name.startswith("avx512.mask.paddus") || // Added in 7.0
+      Name.startswith("avx512.mask.psubus") || // Added in 7.0
+      Name=="ssse3.pabs.b.128" || // Added in 6.0
       Name=="ssse3.pabs.w.128" || // Added in 6.0
       Name=="ssse3.pabs.d.128" || // Added in 6.0
       Name.startswith("fma.vfmsub.") || // Added in 7.0
@@ -894,6 +900,37 @@
   return EmitX86Select(Builder, Mask, Align, Passthru);
 }
 
+static Value *UpgradeX86AddSubSatIntrinsics(IRBuilder<> &Builder, CallInst &CI,
+                                            bool IsAddition) {
+  Value *Op0 = CI.getOperand(0);
+  Value *Op1 = CI.getOperand(1);
+
+  // Result type of the intrinsic.
+  Type *ResultType = CI.getType();
+
+  Value *Res;
+  if (IsAddition) {
+    // ADDUS: a > (a+b) ? ~0 : (a+b)
+    // If Op0 > Add, overflow occurred.
+    Value *Add = Builder.CreateAdd(Op0, Op1);
+    Value *ICmp = Builder.CreateICmp(ICmpInst::ICMP_UGT, Op0, Add);
+    Value *Max = llvm::Constant::getAllOnesValue(ResultType);
+    Res = Builder.CreateSelect(ICmp, Max, Add);
+  } else {
+    // SUBUS: max(a, b) - b
+    Value *ICmp = Builder.CreateICmp(ICmpInst::ICMP_UGT, Op0, Op1);
+    Value *Select = Builder.CreateSelect(ICmp, Op0, Op1);
+    Res = Builder.CreateSub(Select, Op1);
+  }
+
+  if (CI.getNumArgOperands() == 4) { // For masked intrinsics.
+    Value *VecSrc = CI.getOperand(2);
+    Value *Mask = CI.getOperand(3);
+    Res = EmitX86Select(Builder, Mask, Res, VecSrc);
+  }
+  return Res;
+}
+
 static Value *UpgradeMaskedStore(IRBuilder<> &Builder,
                                  Value *Ptr, Value *Data, Value *Mask,
                                  bool Aligned) {
@@ -1826,7 +1863,15 @@
       if (CI->getNumArgOperands() == 3)
         Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep,
                             CI->getArgOperand(1));
-    } else if (IsX86 && Name.startswith("avx512.mask.palignr.")) {
+    } else if (IsX86 && (Name.startswith("sse2.paddus") ||
+                         Name.startswith("avx2.paddus") ||
+                         Name.startswith("avx512.mask.paddus"))) {
+      Rep = UpgradeX86AddSubSatIntrinsics(Builder, *CI, true /*IsAddition*/);
+    } else if (IsX86 && (Name.startswith("sse2.psubus") ||
+                         Name.startswith("avx2.psubus") ||
+                         Name.startswith("avx512.mask.psubus"))) {
+      Rep = UpgradeX86AddSubSatIntrinsics(Builder, *CI, false /*IsAddition*/);
+    } else if (IsX86 && Name.startswith("avx512.mask.palignr.")) {
       Rep = UpgradeX86ALIGNIntrinsics(Builder, CI->getArgOperand(0),
                                       CI->getArgOperand(1),
                                       CI->getArgOperand(2),
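Note (reviewer illustration, not part of the patch): the subtraction branch above emits max(a, b) - b, i.e. a - b clamped at zero, which is exactly the shape the updated tests check for. A minimal unmasked sketch in generic IR (the function name @psubusb_canonical is made up for illustration):

define <16 x i8> @psubusb_canonical(<16 x i8> %a, <16 x i8> %b) {
  ; max(%a, %b) - %b never wraps below zero.
  %cmp = icmp ugt <16 x i8> %a, %b
  %max = select <16 x i1> %cmp, <16 x i8> %a, <16 x i8> %b
  %res = sub <16 x i8> %max, %b
  ret <16 x i8> %res
}

For the masked AVX-512 forms the same value is then passed through EmitX86Select with the passthru and mask operands, as in the four-operand case handled above.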
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -32258,6 +32258,52 @@
     }
   }
 
+  // Match VSELECTs into add with unsigned saturation.
+  if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
+      ((Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
+       (Subtarget.hasAVX() && (VT == MVT::v32i8 || VT == MVT::v16i16)) ||
+       (Subtarget.useBWIRegs() && (VT == MVT::v64i8 ||
+                                   VT == MVT::v32i16)))) {
+    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+
+    SDValue CondLHS = Cond->getOperand(0);
+    SDValue CondRHS = Cond->getOperand(1);
+
+    // Canonicalize ADD to CondRHS to simplify the logic below.
+    if (CondLHS.getOpcode() == ISD::ADD) {
+      std::swap(CondLHS, CondRHS);
+      CC = ISD::getSetCCSwappedOperands(CC);
+    }
+
+    // Check if one of the arms of the VSELECT is vector with all bits set.
+    // If it's on the left side invert the predicate to simplify logic below.
+    SDValue Other;
+    if (ISD::isBuildVectorAllOnes(LHS.getNode())) {
+      Other = RHS;
+      CC = ISD::getSetCCInverse(CC, true);
+    } else if (ISD::isBuildVectorAllOnes(RHS.getNode())) {
+      Other = LHS;
+    }
+
+    // We can test against either of the addition operands.
+    if (Other.getNode() && Other->getNumOperands() == 2 &&
+        (DAG.isEqualTo(Other->getOperand(0), CondLHS) ||
+         DAG.isEqualTo(Other->getOperand(1), CondLHS))) {
+      SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
+
+      auto ADDUSBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
+                             ArrayRef<SDValue> Ops) {
+        return DAG.getNode(X86ISD::ADDUS, DL, Ops[0].getValueType(), Ops);
+      };
+
+      // x <= x+y ? x+y : ~0 --> addus x, y
+      if ((CC == ISD::SETULE) &&
+          Other->getOpcode() == ISD::ADD && DAG.isEqualTo(Other, CondRHS))
+        return SplitOpsAndApply(DAG, Subtarget, DL, VT, { OpLHS, OpRHS },
+                                ADDUSBuilder);
+    }
+  }
+
   if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
     return V;
 
@@ -36160,6 +36206,116 @@
   return DAG.getNode(Opc, DL, VT, LHS, RHS);
 }
 
+/// This function detects the saturating addition or subtraction pattern
+/// between two i8/i16 vectors and replaces it with the equivalent
+/// X86ISD::ADDS/X86ISD::ADDUS/X86ISD::SUBS/X86ISD::SUBUS node.
+static SDValue detectAddSubSatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
+                                      const X86Subtarget &Subtarget,
+                                      const SDLoc &DL) {
+  if (!VT.isVector())
+    return SDValue();
+  EVT InVT = In.getValueType();
+  unsigned NumElems = VT.getVectorNumElements();
+
+  EVT ScalarVT = VT.getVectorElementType();
+  if ((ScalarVT != MVT::i8 && ScalarVT != MVT::i16) ||
+      InVT.getSizeInBits() % 128 != 0 || !isPowerOf2_32(NumElems))
+    return SDValue();
+
+  // InScalarVT is the intermediate element type in the AddSubSat pattern
+  // and must be wider than the output element type.
+  EVT InScalarVT = InVT.getVectorElementType();
+  if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits())
+    return SDValue();
+
+  if (!Subtarget.hasSSE2())
+    return SDValue();
+
+  // Detect the following pattern (shown for an unsigned i8 addition):
+  // %2 = zext <16 x i8> %0 to <16 x i16>
+  // %3 = zext <16 x i8> %1 to <16 x i16>
+  // %4 = add nuw nsw <16 x i16> %3, %2
+  // %5 = icmp ult <16 x i16> %4, (splat of the max ScalarVT value)
+  // %6 = select <16 x i1> %5, <16 x i16> %4, (splat of the max ScalarVT value)
+  // %7 = trunc <16 x i16> %6 to <16 x i8>
+
+  // Detect a Sat Pattern
+  bool Signed = true;
+  SDValue Sat = detectSSatPattern(In, VT, false);
+  if (!Sat) {
+    Sat = detectUSatPattern(In, VT, DAG, DL);
+    Signed = false;
+  }
+  if (!Sat)
+    return SDValue();
+  if (Sat.getOpcode() != ISD::ADD && Sat.getOpcode() != ISD::SUB)
+    return SDValue();
+
+  unsigned Opcode = Sat.getOpcode() == ISD::ADD ? Signed ? X86ISD::ADDS
+                                                         : X86ISD::ADDUS
+                                                : Signed ? X86ISD::SUBS
+                                                         : X86ISD::SUBUS;
+
+  // Get the operands of the saturating add/sub.
+  SDValue LHS = Sat.getOperand(0);
+  SDValue RHS = Sat.getOperand(1);
+
+  // Don't combine if both operands are constant.
+  if (ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
+      ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
+    return SDValue();
+
+  // Check if Op is a result of type promotion.
+  auto IsExtended = [=, &DAG](SDValue Op) {
+    unsigned Opcode = Op.getOpcode();
+    unsigned EltSize = ScalarVT.getSizeInBits();
+    unsigned ExtEltSize = InScalarVT.getSizeInBits();
+    unsigned ExtPartSize = ExtEltSize - EltSize;
+
+    // Extension of non-constant operand.
+    if (Opcode == ISD::ZERO_EXTEND || Opcode == ISD::SIGN_EXTEND) {
+      if (Signed)
+        return DAG.ComputeNumSignBits(Op) > ExtPartSize;
+      else {
+        APInt HighBitsMask = APInt::getHighBitsSet(ExtEltSize, ExtPartSize);
+        return DAG.MaskedValueIsZero(Op, HighBitsMask);
+      }
+    } else if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
+      // Build vector of constant nodes. Each of them needs to be a correct
+      // extension from a constant of ScalarVT type.
+      unsigned NumOperands = Op.getNumOperands();
+      for (unsigned i = 0; i < NumOperands; ++i) {
+        APInt Elt = cast<ConstantSDNode>(Op.getOperand(i))->getAPIntValue();
+        Elt = Elt.getHiBits(Elt.isSignedIntN(ExtEltSize) ? ExtPartSize + 1
+                                                         : ExtPartSize);
+        if ((Signed && (!Elt.isAllOnesValue() && !Elt.isNullValue())) ||
+            (!Signed && !Elt.isNullValue()))
+          return false;
+      }
+      return true;
+    }
+    return false;
+  };
+
+  // Either both operands are extended or one of them is extended
+  // and the other is a vector of constants.
+  if (!IsExtended(LHS) || !IsExtended(RHS))
+    return SDValue();
+
+  // Truncate extended nodes to result type.
+  LHS = DAG.getNode(ISD::TRUNCATE, DL, VT, LHS);
+  RHS = DAG.getNode(ISD::TRUNCATE, DL, VT, RHS);
+
+  // The pattern is detected, emit ADDS/ADDUS/SUBS/SUBUS instruction.
+  auto AddSubSatBuilder = [Opcode](SelectionDAG &DAG, const SDLoc &DL,
+                                   ArrayRef<SDValue> Ops) {
+    EVT VT = Ops[0].getValueType();
+    return DAG.getNode(Opcode, DL, VT, Ops);
+  };
+  return SplitOpsAndApply(DAG, Subtarget, DL, VT, { LHS, RHS },
+                          AddSubSatBuilder);
+}
+
 static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
                                const X86Subtarget &Subtarget) {
   EVT VT = N->getValueType(0);
@@ -36174,6 +36330,10 @@
   if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
     return Avg;
 
+  // Try to detect addition or subtraction with saturation.
+  if (SDValue AddSubSat = detectAddSubSatPattern(Src, VT, DAG, Subtarget, DL))
+    return AddSubSat;
+
   // Try to combine truncation with signed/unsigned saturation.
   if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget))
     return Val;
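Note (reviewer illustration, not part of the patch): detectAddSubSatPattern is called from combineTruncate, so the value it inspects is the widened and clamped result being truncated back to the narrow element type. A 128-bit sketch of the signed i8 case it is meant to match, narrowed from the 256-bit canonical tests added in this patch (the function name @padds_b_canonical is made up for illustration):

define <16 x i8> @padds_b_canonical(<16 x i8> %a, <16 x i8> %b) {
  %x = sext <16 x i8> %a to <16 x i16>
  %y = sext <16 x i8> %b to <16 x i16>
  %s = add nsw <16 x i16> %x, %y
  ; Clamp the widened sum to [-128, 127] before truncating.
  %lo = icmp slt <16 x i16> %s, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
  %c1 = select <16 x i1> %lo, <16 x i16> %s, <16 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
  %hi = icmp sgt <16 x i16> %c1, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
  %c2 = select <16 x i1> %hi, <16 x i16> %c1, <16 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
  %r = trunc <16 x i16> %c2 to <16 x i8>
  ret <16 x i8> %r
}

detectSSatPattern peels off the two icmp/select clamps, IsExtended confirms both add operands really are sign extensions of the narrow type, and the chain is then rebuilt as a single X86ISD::ADDS on <16 x i8>.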
Index: lib/Target/X86/X86IntrinsicsInfo.h
===================================================================
--- lib/Target/X86/X86IntrinsicsInfo.h
+++ lib/Target/X86/X86IntrinsicsInfo.h
@@ -402,8 +402,6 @@
   X86_INTRINSIC_DATA(avx2_packuswb, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
   X86_INTRINSIC_DATA(avx2_padds_b, INTR_TYPE_2OP, X86ISD::ADDS, 0),
   X86_INTRINSIC_DATA(avx2_padds_w, INTR_TYPE_2OP, X86ISD::ADDS, 0),
-  X86_INTRINSIC_DATA(avx2_paddus_b, INTR_TYPE_2OP, X86ISD::ADDUS, 0),
-  X86_INTRINSIC_DATA(avx2_paddus_w, INTR_TYPE_2OP, X86ISD::ADDUS, 0),
   X86_INTRINSIC_DATA(avx2_permd, VPERM_2OP, X86ISD::VPERMV, 0),
   X86_INTRINSIC_DATA(avx2_permps, VPERM_2OP, X86ISD::VPERMV, 0),
   X86_INTRINSIC_DATA(avx2_phadd_d, INTR_TYPE_2OP, X86ISD::HADD, 0),
@@ -446,8 +444,6 @@
   X86_INTRINSIC_DATA(avx2_psrlv_q_256, INTR_TYPE_2OP, ISD::SRL, 0),
   X86_INTRINSIC_DATA(avx2_psubs_b, INTR_TYPE_2OP, X86ISD::SUBS, 0),
   X86_INTRINSIC_DATA(avx2_psubs_w, INTR_TYPE_2OP, X86ISD::SUBS, 0),
-  X86_INTRINSIC_DATA(avx2_psubus_b, INTR_TYPE_2OP, X86ISD::SUBUS, 0),
-  X86_INTRINSIC_DATA(avx2_psubus_w, INTR_TYPE_2OP, X86ISD::SUBUS, 0),
   X86_INTRINSIC_DATA(avx512_cvtsi2sd64,  INTR_TYPE_3OP, X86ISD::SCALAR_SINT_TO_FP_RND, 0),
   X86_INTRINSIC_DATA(avx512_cvtsi2ss32,  INTR_TYPE_3OP, X86ISD::SCALAR_SINT_TO_FP_RND, 0),
   X86_INTRINSIC_DATA(avx512_cvtsi2ss64,  INTR_TYPE_3OP, X86ISD::SCALAR_SINT_TO_FP_RND, 0),
@@ -779,12 +775,6 @@
   X86_INTRINSIC_DATA(avx512_mask_padds_w_128, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0),
   X86_INTRINSIC_DATA(avx512_mask_padds_w_256, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0),
   X86_INTRINSIC_DATA(avx512_mask_padds_w_512, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0),
-  X86_INTRINSIC_DATA(avx512_mask_paddus_b_128, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0),
-  X86_INTRINSIC_DATA(avx512_mask_paddus_b_256, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0),
-  X86_INTRINSIC_DATA(avx512_mask_paddus_b_512, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0),
-  X86_INTRINSIC_DATA(avx512_mask_paddus_w_128, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0),
-  X86_INTRINSIC_DATA(avx512_mask_paddus_w_256, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0),
-  X86_INTRINSIC_DATA(avx512_mask_paddus_w_512, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0),
   X86_INTRINSIC_DATA(avx512_mask_pmov_db_128, INTR_TYPE_1OP_MASK,
                      X86ISD::VTRUNC, 0),
   X86_INTRINSIC_DATA(avx512_mask_pmov_db_256, INTR_TYPE_1OP_MASK,
@@ -929,12 +919,6 @@
   X86_INTRINSIC_DATA(avx512_mask_psubs_w_128, INTR_TYPE_2OP_MASK, X86ISD::SUBS, 0),
   X86_INTRINSIC_DATA(avx512_mask_psubs_w_256, INTR_TYPE_2OP_MASK, X86ISD::SUBS, 0),
   X86_INTRINSIC_DATA(avx512_mask_psubs_w_512, INTR_TYPE_2OP_MASK, X86ISD::SUBS, 0),
-  X86_INTRINSIC_DATA(avx512_mask_psubus_b_128, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0),
-  X86_INTRINSIC_DATA(avx512_mask_psubus_b_256, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0),
-  X86_INTRINSIC_DATA(avx512_mask_psubus_b_512, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0),
-  X86_INTRINSIC_DATA(avx512_mask_psubus_w_128, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0),
-  X86_INTRINSIC_DATA(avx512_mask_psubus_w_256, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0),
-  X86_INTRINSIC_DATA(avx512_mask_psubus_w_512, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0),
   X86_INTRINSIC_DATA(avx512_mask_range_pd_128, INTR_TYPE_3OP_MASK, X86ISD::VRANGE, 0),
   X86_INTRINSIC_DATA(avx512_mask_range_pd_256, INTR_TYPE_3OP_MASK, X86ISD::VRANGE, 0),
   X86_INTRINSIC_DATA(avx512_mask_range_pd_512, INTR_TYPE_3OP_MASK, X86ISD::VRANGE, X86ISD::VRANGE_RND),
@@ -1405,8 +1389,6 @@
   X86_INTRINSIC_DATA(sse2_packuswb_128, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
   X86_INTRINSIC_DATA(sse2_padds_b,      INTR_TYPE_2OP, X86ISD::ADDS, 0),
   X86_INTRINSIC_DATA(sse2_padds_w,      INTR_TYPE_2OP, X86ISD::ADDS, 0),
-  X86_INTRINSIC_DATA(sse2_paddus_b,     INTR_TYPE_2OP, X86ISD::ADDUS, 0),
-  X86_INTRINSIC_DATA(sse2_paddus_w,     INTR_TYPE_2OP, X86ISD::ADDUS, 0),
   X86_INTRINSIC_DATA(sse2_pmadd_wd,     INTR_TYPE_2OP, X86ISD::VPMADDWD, 0),
   X86_INTRINSIC_DATA(sse2_pmovmskb_128, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
   X86_INTRINSIC_DATA(sse2_pmulh_w,      INTR_TYPE_2OP, ISD::MULHS, 0),
@@ -1430,8 +1412,6 @@
   X86_INTRINSIC_DATA(sse2_psrli_w,      VSHIFT, X86ISD::VSRLI, 0),
   X86_INTRINSIC_DATA(sse2_psubs_b,      INTR_TYPE_2OP, X86ISD::SUBS, 0),
   X86_INTRINSIC_DATA(sse2_psubs_w,      INTR_TYPE_2OP, X86ISD::SUBS, 0),
-  X86_INTRINSIC_DATA(sse2_psubus_b,     INTR_TYPE_2OP, X86ISD::SUBUS, 0),
-  X86_INTRINSIC_DATA(sse2_psubus_w,     INTR_TYPE_2OP, X86ISD::SUBUS, 0),
   X86_INTRINSIC_DATA(sse2_sqrt_pd,      INTR_TYPE_1OP, ISD::FSQRT, 0),
   X86_INTRINSIC_DATA(sse2_ucomieq_sd,   COMI, X86ISD::UCOMI, ISD::SETEQ),
   X86_INTRINSIC_DATA(sse2_ucomige_sd,   COMI, X86ISD::UCOMI, ISD::SETGE),
Index: test/CodeGen/X86/avx2-intrinsics-canonical.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/avx2-intrinsics-canonical.ll
@@ -0,0 +1,357 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=avx2 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=AVX2 --check-prefix=X86 --check-prefix=X86-AVX
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512VL --check-prefix=X86 --check-prefix=X86-AVX512VL
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx2 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=AVX2 --check-prefix=X64 --check-prefix=X64-AVX
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512VL --check-prefix=X64 --check-prefix=X64-AVX512VL
+
+; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx2-builtins.c
+
+define <32 x i8> @test_x86_avx2_padds_b(<32 x i8> %a0, <32 x i8> %a1) {
+; X86-AVX-LABEL: test_x86_avx2_padds_b:
+; X86-AVX:       ## %bb.0:
+; X86-AVX-NEXT:    vpaddsb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xec,0xc1]
+; X86-AVX-NEXT:    retl ## encoding: [0xc3]
+;
+; X86-AVX512VL-LABEL: test_x86_avx2_padds_b:
+; X86-AVX512VL:       ## %bb.0:
+; X86-AVX512VL-NEXT:    vpaddsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xec,0xc1]
+; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-AVX-LABEL: test_x86_avx2_padds_b:
+; X64-AVX:       ## %bb.0:
+; X64-AVX-NEXT:    vpaddsb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xec,0xc1]
+; X64-AVX-NEXT:    retq ## encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_avx2_padds_b:
+; X64-AVX512VL:       ## %bb.0:
+; X64-AVX512VL-NEXT:    vpaddsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xec,0xc1]
+; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+  %1 = sext <32 x i8> %a0 to <32 x i16>
+  %2 = sext <32 x i8> %a1 to <32 x i16>
+  %3 = add nsw <32 x i16> %1, %2
+  %4 = icmp slt <32 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %6 = icmp sgt <32 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %8 = trunc <32 x i16> %7 to <32 x i8>
+  ret <32 x i8> %8
+}
+
+define <16 x i16> @test_x86_avx2_padds_w(<16 x i16> %a0, <16 x i16> %a1) {
+; X86-AVX-LABEL: test_x86_avx2_padds_w:
+; X86-AVX:       ## %bb.0:
+; X86-AVX-NEXT:    vpaddsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xed,0xc1]
+; X86-AVX-NEXT:    retl ## encoding: [0xc3]
+;
+; X86-AVX512VL-LABEL: test_x86_avx2_padds_w:
+; X86-AVX512VL:       ## %bb.0:
+; X86-AVX512VL-NEXT:    vpaddsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xed,0xc1]
+; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-AVX-LABEL: test_x86_avx2_padds_w:
+; X64-AVX:       ## %bb.0:
+; X64-AVX-NEXT:    vpaddsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xed,0xc1]
+; X64-AVX-NEXT:    retq ## encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_avx2_padds_w:
+; X64-AVX512VL:       ## %bb.0:
+; X64-AVX512VL-NEXT:    vpaddsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xed,0xc1]
+; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+  %1 = sext <16 x i16> %a0 to <16 x i32>
+  %2 = sext <16 x i16> %a1 to <16 x i32>
+  %3 = add nsw <16 x i32> %1, %2
+  %4 = icmp slt <16 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %6 = icmp sgt <16 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %8 = trunc <16 x i32> %7 to <16 x i16>
+  ret <16 x i16> %8
+}
+
+define <32 x i8> @test_x86_avx2_paddus_b(<32 x i8> %a0, <32 x i8> %a1) {
+; X86-AVX-LABEL: test_x86_avx2_paddus_b:
+; X86-AVX:       ## %bb.0:
+; X86-AVX-NEXT:    vpaddusb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xdc,0xc1]
+; X86-AVX-NEXT:    retl ## encoding: [0xc3]
+;
+; X86-AVX512VL-LABEL: test_x86_avx2_paddus_b:
+; X86-AVX512VL:       ## %bb.0:
+; X86-AVX512VL-NEXT:    vpaddusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdc,0xc1]
+; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-AVX-LABEL: test_x86_avx2_paddus_b:
+; X64-AVX:       ## %bb.0:
+; X64-AVX-NEXT:    vpaddusb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xdc,0xc1]
+; X64-AVX-NEXT:    retq ## encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_avx2_paddus_b:
+; X64-AVX512VL:       ## %bb.0:
+; X64-AVX512VL-NEXT:    vpaddusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdc,0xc1]
+; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+  %1 = add <32 x i8> %a0, %a1
+  %2 = icmp ugt <32 x i8> %a0, %1
+  %3 = select <32 x i1> %2, <32 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <32 x i8> %1
+  ret <32 x i8> %3
+}
+
+define <16 x i16> @test_x86_avx2_paddus_w(<16 x i16> %a0, <16 x i16> %a1) {
+; X86-AVX-LABEL: test_x86_avx2_paddus_w:
+; X86-AVX:       ## %bb.0:
+; X86-AVX-NEXT:    vpaddusw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xdd,0xc1]
+; X86-AVX-NEXT:    retl ## encoding: [0xc3]
+;
+; X86-AVX512VL-LABEL: test_x86_avx2_paddus_w:
+; X86-AVX512VL:       ## %bb.0:
+; X86-AVX512VL-NEXT:    vpaddusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdd,0xc1]
+; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-AVX-LABEL: test_x86_avx2_paddus_w:
+; X64-AVX:       ## %bb.0:
+; X64-AVX-NEXT:    vpaddusw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xdd,0xc1]
+; X64-AVX-NEXT:    retq ## encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_avx2_paddus_w:
+; X64-AVX512VL:       ## %bb.0:
+; X64-AVX512VL-NEXT:    vpaddusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdd,0xc1]
+; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+  %1 = add <16 x i16> %a0, %a1
+  %2 = icmp ugt <16 x i16> %a0, %1
+  %3 = select <16 x i1> %2, <16 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <16 x i16> %1
+  ret <16 x i16> %3
+}
+
+define <32 x i8> @test_x86_avx2_psubs_b(<32 x i8> %a0, <32 x i8> %a1) {
+; X86-AVX-LABEL: test_x86_avx2_psubs_b:
+; X86-AVX:       ## %bb.0:
+; X86-AVX-NEXT:    vpsubsb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe8,0xc1]
+; X86-AVX-NEXT:    retl ## encoding: [0xc3]
+;
+; X86-AVX512VL-LABEL: test_x86_avx2_psubs_b:
+; X86-AVX512VL:       ## %bb.0:
+; X86-AVX512VL-NEXT:    vpsubsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe8,0xc1]
+; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-AVX-LABEL: test_x86_avx2_psubs_b:
+; X64-AVX:       ## %bb.0:
+; X64-AVX-NEXT:    vpsubsb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe8,0xc1]
+; X64-AVX-NEXT:    retq ## encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_avx2_psubs_b:
+; X64-AVX512VL:       ## %bb.0:
+; X64-AVX512VL-NEXT:    vpsubsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe8,0xc1]
+; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+  %1 = sext <32 x i8> %a0 to <32 x i16>
+  %2 = sext <32 x i8> %a1 to <32 x i16>
+  %3 = sub nsw <32 x i16> %1, %2
+  %4 = icmp slt <32 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %6 = icmp sgt <32 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %8 = trunc <32 x i16> %7 to <32 x i8>
+  ret <32 x i8> %8
+}
+
+define <16 x i16> @test_x86_avx2_psubs_w(<16 x i16> %a0, <16 x i16> %a1) {
+; X86-AVX-LABEL: test_x86_avx2_psubs_w:
+; X86-AVX:       ## %bb.0:
+; X86-AVX-NEXT:    vpsubsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe9,0xc1]
+; X86-AVX-NEXT:    retl ## encoding: [0xc3]
+;
+; X86-AVX512VL-LABEL: test_x86_avx2_psubs_w:
+; X86-AVX512VL:       ## %bb.0:
+; X86-AVX512VL-NEXT:    vpsubsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe9,0xc1]
+; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-AVX-LABEL: test_x86_avx2_psubs_w:
+; X64-AVX:       ## %bb.0:
+; X64-AVX-NEXT:    vpsubsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe9,0xc1]
+; X64-AVX-NEXT:    retq ## encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_avx2_psubs_w:
+; X64-AVX512VL:       ## %bb.0:
+; X64-AVX512VL-NEXT:    vpsubsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe9,0xc1]
+; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+  %1 = sext <16 x i16> %a0 to <16 x i32>
+  %2 = sext <16 x i16> %a1 to <16 x i32>
+  %3 = sub nsw <16 x i32> %1, %2
+  %4 = icmp slt <16 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %6 = icmp sgt <16 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %8 = trunc <16 x i32> %7 to <16 x i16>
+  ret <16 x i16> %8
+}
+
+define <32 x i8> @test_x86_avx2_psubus_b(<32 x i8> %a0, <32 x i8> %a1) {
+; X86-AVX-LABEL: test_x86_avx2_psubus_b:
+; X86-AVX:       ## %bb.0:
+; X86-AVX-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd8,0xc1]
+; X86-AVX-NEXT:    retl ## encoding: [0xc3]
+;
+; X86-AVX512VL-LABEL: test_x86_avx2_psubus_b:
+; X86-AVX512VL:       ## %bb.0:
+; X86-AVX512VL-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd8,0xc1]
+; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-AVX-LABEL: test_x86_avx2_psubus_b:
+; X64-AVX:       ## %bb.0:
+; X64-AVX-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd8,0xc1]
+; X64-AVX-NEXT:    retq ## encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_avx2_psubus_b:
+; X64-AVX512VL:       ## %bb.0:
+; X64-AVX512VL-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd8,0xc1]
+; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+  %cmp = icmp ugt <32 x i8> %a0, %a1
+  %sel = select <32 x i1> %cmp, <32 x i8> %a0, <32 x i8> %a1
+  %sub = sub <32 x i8> %sel, %a1
+  ret <32 x i8> %sub
+}
+
+define <16 x i16> @test_x86_avx2_psubus_w(<16 x i16> %a0, <16 x i16> %a1) {
+; X86-AVX-LABEL: test_x86_avx2_psubus_w:
+; X86-AVX:       ## %bb.0:
+; X86-AVX-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd9,0xc1]
+; X86-AVX-NEXT:    retl ## encoding: [0xc3]
+;
+; X86-AVX512VL-LABEL: test_x86_avx2_psubus_w:
+; X86-AVX512VL:       ## %bb.0:
+; X86-AVX512VL-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd9,0xc1]
+; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-AVX-LABEL: test_x86_avx2_psubus_w:
+; X64-AVX:       ## %bb.0:
+; X64-AVX-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd9,0xc1]
+; X64-AVX-NEXT:    retq ## encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_avx2_psubus_w:
+; X64-AVX512VL:       ## %bb.0:
+; X64-AVX512VL-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd9,0xc1]
+; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+  %cmp = icmp ugt <16 x i16> %a0, %a1
+  %sel = select <16 x i1> %cmp, <16 x i16> %a0, <16 x i16> %a1
+  %sub = sub <16 x i16> %sel, %a1
+  ret <16 x i16> %sub
+}
+
+define <32 x i16> @test_x86_avx2_padds_w_512(<32 x i16> %a, <32 x i16> %b) {
+; X86-AVX-LABEL: test_x86_avx2_padds_w_512:
+; X86-AVX:       ## %bb.0:
+; X86-AVX-NEXT:    vpaddsw %ymm2, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xed,0xc2]
+; X86-AVX-NEXT:    vpaddsw %ymm3, %ymm1, %ymm1 ## encoding: [0xc5,0xf5,0xed,0xcb]
+; X86-AVX-NEXT:    retl ## encoding: [0xc3]
+;
+; X86-AVX512VL-LABEL: test_x86_avx2_padds_w_512:
+; X86-AVX512VL:       ## %bb.0:
+; X86-AVX512VL-NEXT:    vpaddsw %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xed,0xc1]
+; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-AVX-LABEL: test_x86_avx2_padds_w_512:
+; X64-AVX:       ## %bb.0:
+; X64-AVX-NEXT:    vpaddsw %ymm2, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xed,0xc2]
+; X64-AVX-NEXT:    vpaddsw %ymm3, %ymm1, %ymm1 ## encoding: [0xc5,0xf5,0xed,0xcb]
+; X64-AVX-NEXT:    retq ## encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_avx2_padds_w_512:
+; X64-AVX512VL:       ## %bb.0:
+; X64-AVX512VL-NEXT:    vpaddsw %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xed,0xc1]
+; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+  %1 = sext <32 x i16> %a to <32 x i32>
+  %2 = sext <32 x i16> %b to <32 x i32>
+  %3 = add nsw <32 x i32> %1, %2
+  %4 = icmp slt <32 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %6 = icmp sgt <32 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %7 = select <32 x i1> %6, <32 x i32> %5, <32 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %8 = trunc <32 x i32> %7 to <32 x i16>
+  ret <32 x i16> %8
+}
+
+define <32 x i16> @test_x86_avx2_psubs_w_512(<32 x i16> %a, <32 x i16> %b) {
+; X86-AVX-LABEL: test_x86_avx2_psubs_w_512:
+; X86-AVX:       ## %bb.0:
+; X86-AVX-NEXT:    vpsubsw %ymm2, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe9,0xc2]
+; X86-AVX-NEXT:    vpsubsw %ymm3, %ymm1, %ymm1 ## encoding: [0xc5,0xf5,0xe9,0xcb]
+; X86-AVX-NEXT:    retl ## encoding: [0xc3]
+;
+; X86-AVX512VL-LABEL: test_x86_avx2_psubs_w_512:
+; X86-AVX512VL:       ## %bb.0:
+; X86-AVX512VL-NEXT:    vpsubsw %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xe9,0xc1]
+; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-AVX-LABEL: test_x86_avx2_psubs_w_512:
+; X64-AVX:       ## %bb.0:
+; X64-AVX-NEXT:    vpsubsw %ymm2, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe9,0xc2]
+; X64-AVX-NEXT:    vpsubsw %ymm3, %ymm1, %ymm1 ## encoding: [0xc5,0xf5,0xe9,0xcb]
+; X64-AVX-NEXT:    retq ## encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_avx2_psubs_w_512:
+; X64-AVX512VL:       ## %bb.0:
+; X64-AVX512VL-NEXT:    vpsubsw %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xe9,0xc1]
+; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+  %1 = sext <32 x i16> %a to <32 x i32>
+  %2 = sext <32 x i16> %b to <32 x i32>
+  %3 = sub nsw <32 x i32> %1, %2
+  %4 = icmp slt <32 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %6 = icmp sgt <32 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %7 = select <32 x i1> %6, <32 x i32> %5, <32 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %8 = trunc <32 x i32> %7 to <32 x i16>
+  ret <32 x i16> %8
+}
+
+define <32 x i16> @test_x86_avx2_paddus_w_512(<32 x i16> %a, <32 x i16> %b) {
+; X86-AVX-LABEL: test_x86_avx2_paddus_w_512:
+; X86-AVX:       ## %bb.0:
+; X86-AVX-NEXT:    vpaddusw %ymm2, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xdd,0xc2]
+; X86-AVX-NEXT:    vpaddusw %ymm3, %ymm1, %ymm1 ## encoding: [0xc5,0xf5,0xdd,0xcb]
+; X86-AVX-NEXT:    retl ## encoding: [0xc3]
+;
+; X86-AVX512VL-LABEL: test_x86_avx2_paddus_w_512:
+; X86-AVX512VL:       ## %bb.0:
+; X86-AVX512VL-NEXT:    vpaddusw %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xdd,0xc1]
+; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-AVX-LABEL: test_x86_avx2_paddus_w_512:
+; X64-AVX:       ## %bb.0:
+; X64-AVX-NEXT:    vpaddusw %ymm2, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xdd,0xc2]
+; X64-AVX-NEXT:    vpaddusw %ymm3, %ymm1, %ymm1 ## encoding: [0xc5,0xf5,0xdd,0xcb]
+; X64-AVX-NEXT:    retq ## encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_avx2_paddus_w_512:
+; X64-AVX512VL:       ## %bb.0:
+; X64-AVX512VL-NEXT:    vpaddusw %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xdd,0xc1]
+; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+  %1 = add <32 x i16> %a, %b
+  %2 = icmp ugt <32 x i16> %a, %1
+  %3 = select <32 x i1> %2, <32 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <32 x i16> %1
+  ret <32 x i16> %3
+}
+
+define <32 x i16> @test_x86_avx2_psubus_w_512(<32 x i16> %a, <32 x i16> %b) {
+; X86-AVX-LABEL: test_x86_avx2_psubus_w_512:
+; X86-AVX:       ## %bb.0:
+; X86-AVX-NEXT:    vpsubusw %ymm2, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd9,0xc2]
+; X86-AVX-NEXT:    vpsubusw %ymm3, %ymm1, %ymm1 ## encoding: [0xc5,0xf5,0xd9,0xcb]
+; X86-AVX-NEXT:    retl ## encoding: [0xc3]
+;
+; X86-AVX512VL-LABEL: test_x86_avx2_psubus_w_512:
+; X86-AVX512VL:       ## %bb.0:
+; X86-AVX512VL-NEXT:    vpsubusw %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xd9,0xc1]
+; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-AVX-LABEL: test_x86_avx2_psubus_w_512:
+; X64-AVX:       ## %bb.0:
+; X64-AVX-NEXT:    vpsubusw %ymm2, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd9,0xc2]
+; X64-AVX-NEXT:    vpsubusw %ymm3, %ymm1, %ymm1 ## encoding: [0xc5,0xf5,0xd9,0xcb]
+; X64-AVX-NEXT:    retq ## encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_avx2_psubus_w_512:
+; X64-AVX512VL:       ## %bb.0:
+; X64-AVX512VL-NEXT:    vpsubusw %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xd9,0xc1]
+; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+  %cmp = icmp ugt <32 x i16> %a, %b
+  %sel = select <32 x i1> %cmp, <32 x i16> %a, <32 x i16> %b
+  %sub = sub <32 x i16> %sel, %b
+  ret <32 x i16> %sub
+}
Index: test/CodeGen/X86/avx2-intrinsics-fast-isel.ll
===================================================================
--- test/CodeGen/X86/avx2-intrinsics-fast-isel.ll
+++ test/CodeGen/X86/avx2-intrinsics-fast-isel.ll
@@ -98,11 +98,18 @@
 ; CHECK-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
-  %res = call <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8> %arg0, <32 x i8> %arg1)
-  %bc = bitcast <32 x i8> %res to <4 x i64>
+  %1 = sext <32 x i8> %arg0 to <32 x i16>
+  %2 = sext <32 x i8> %arg1 to <32 x i16>
+  %3 = add nsw <32 x i16> %1, %2
+  %4 = icmp slt <32 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %6 = icmp sgt <32 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %8 = trunc <32 x i16> %7 to <32 x i8>
+  %bc = bitcast <32 x i8> %8 to <4 x i64>
   ret <4 x i64> %bc
 }
-declare <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8>, <32 x i8>) nounwind readnone
+
 
 define <4 x i64> @test_mm256_adds_epi16(<4 x i64> %a0, <4 x i64> %a1) {
 ; CHECK-LABEL: test_mm256_adds_epi16:
@@ -111,11 +118,18 @@
 ; CHECK-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
-  %res = call <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16> %arg0, <16 x i16> %arg1)
-  %bc = bitcast <16 x i16> %res to <4 x i64>
+  %1 = sext <16 x i16> %arg0 to <16 x i32>
+  %2 = sext <16 x i16> %arg1 to <16 x i32>
+  %3 = add nsw <16 x i32> %1, %2
+  %4 = icmp slt <16 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %6 = icmp sgt <16 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %8 = trunc <16 x i32> %7 to <16 x i16>
+  %bc = bitcast <16 x i16> %8 to <4 x i64>
   ret <4 x i64> %bc
 }
-declare <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16>, <16 x i16>) nounwind readnone
+
 
 define <4 x i64> @test_mm256_adds_epu8(<4 x i64> %a0, <4 x i64> %a1) {
 ; CHECK-LABEL: test_mm256_adds_epu8:
@@ -124,11 +138,13 @@
 ; CHECK-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
-  %res = call <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8> %arg0, <32 x i8> %arg1)
-  %bc = bitcast <32 x i8> %res to <4 x i64>
+  %1 = add <32 x i8> %arg0, %arg1
+  %2 = icmp ugt <32 x i8> %arg0, %1
+  %3 = select <32 x i1> %2, <32 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <32 x i8> %1
+  %bc = bitcast <32 x i8> %3 to <4 x i64>
   ret <4 x i64> %bc
 }
-declare <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8>, <32 x i8>) nounwind readnone
+
 
 define <4 x i64> @test_mm256_adds_epu16(<4 x i64> %a0, <4 x i64> %a1) {
 ; CHECK-LABEL: test_mm256_adds_epu16:
@@ -137,11 +153,12 @@
 ; CHECK-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
-  %res = call <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16> %arg0, <16 x i16> %arg1)
-  %bc = bitcast <16 x i16> %res to <4 x i64>
+  %1 = add <16 x i16> %arg0, %arg1
+  %2 = icmp ugt <16 x i16> %arg0, %1
+  %3 = select <16 x i1> %2, <16 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <16 x i16> %1
+  %bc = bitcast <16 x i16> %3 to <4 x i64>
   ret <4 x i64> %bc
 }
-declare <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16>, <16 x i16>) nounwind readnone
 
 define <4 x i64> @test_mm256_alignr_epi8(<4 x i64> %a0, <4 x i64> %a1) {
 ; CHECK-LABEL: test_mm256_alignr_epi8:
@@ -2529,11 +2546,18 @@
 ; CHECK-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
-  %res = call <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8> %arg0, <32 x i8> %arg1)
-  %bc = bitcast <32 x i8> %res to <4 x i64>
+  %1 = sext <32 x i8> %arg0 to <32 x i16>
+  %2 = sext <32 x i8> %arg1 to <32 x i16>
+  %3 = sub nsw <32 x i16> %1, %2
+  %4 = icmp slt <32 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %6 = icmp sgt <32 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %8 = trunc <32 x i16> %7 to <32 x i8>
+  %bc = bitcast <32 x i8> %8 to <4 x i64>
   ret <4 x i64> %bc
 }
-declare <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8>, <32 x i8>) nounwind readnone
+
 
 define <4 x i64> @test_mm256_subs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
 ; CHECK-LABEL: test_mm256_subs_epi16:
@@ -2542,37 +2566,49 @@
 ; CHECK-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
-  %res = call <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16> %arg0, <16 x i16> %arg1)
-  %bc = bitcast <16 x i16> %res to <4 x i64>
+  %1 = sext <16 x i16> %arg0 to <16 x i32>
+  %2 = sext <16 x i16> %arg1 to <16 x i32>
+  %3 = sub nsw <16 x i32> %1, %2
+  %4 = icmp slt <16 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %6 = icmp sgt <16 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %8 = trunc <16 x i32> %7 to <16 x i16>
+  %bc = bitcast <16 x i16> %8 to <4 x i64>
   ret <4 x i64> %bc
 }
-declare <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16>, <16 x i16>) nounwind readnone
+
 
 define <4 x i64> @test_mm256_subs_epu8(<4 x i64> %a0, <4 x i64> %a1) {
 ; CHECK-LABEL: test_mm256_subs_epu8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
-  %res = call <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8> %arg0, <32 x i8> %arg1)
-  %bc = bitcast <32 x i8> %res to <4 x i64>
+  %cmp = icmp ugt <32 x i8> %arg0, %arg1
+  %sel = select <32 x i1> %cmp, <32 x i8> %arg0, <32 x i8> %arg1
+  %sub = sub <32 x i8> %sel, %arg1
+  %bc = bitcast <32 x i8> %sub to <4 x i64>
   ret <4 x i64> %bc
 }
-declare <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8>, <32 x i8>) nounwind readnone
+
 
 define <4 x i64> @test_mm256_subs_epu16(<4 x i64> %a0, <4 x i64> %a1) {
 ; CHECK-LABEL: test_mm256_subs_epu16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
-  %res = call <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16> %arg0, <16 x i16> %arg1)
-  %bc = bitcast <16 x i16> %res to <4 x i64>
+  %cmp = icmp ugt <16 x i16> %arg0, %arg1
+  %sel = select <16 x i1> %cmp, <16 x i16> %arg0, <16 x i16> %arg1
+  %sub = sub <16 x i16> %sel, %arg1
+  %bc = bitcast <16 x i16> %sub to <4 x i64>
   ret <4 x i64> %bc
 }
-declare <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16>, <16 x i16>) nounwind readnone
 
 define <4 x i64> @test_mm256_unpackhi_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
 ; CHECK-LABEL: test_mm256_unpackhi_epi8:
Index: test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll
===================================================================
--- test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll
+++ test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll
@@ -878,3 +878,107 @@
   ret <4 x i64> %res
 }
 declare <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32>, <8 x i32>) nounwind readnone
+
+
+define <32 x i8> @test_x86_avx2_paddus_b(<32 x i8> %a0, <32 x i8> %a1) {
+; X86-AVX-LABEL: test_x86_avx2_paddus_b:
+; X86-AVX:       ## %bb.0:
+; X86-AVX-NEXT:    vpaddusb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xdc,0xc1]
+; X86-AVX-NEXT:    retl ## encoding: [0xc3]
+;
+; X86-AVX512VL-LABEL: test_x86_avx2_paddus_b:
+; X86-AVX512VL:       ## %bb.0:
+; X86-AVX512VL-NEXT:    vpaddusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdc,0xc1]
+; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-AVX-LABEL: test_x86_avx2_paddus_b:
+; X64-AVX:       ## %bb.0:
+; X64-AVX-NEXT:    vpaddusb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xdc,0xc1]
+; X64-AVX-NEXT:    retq ## encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_avx2_paddus_b:
+; X64-AVX512VL:       ## %bb.0:
+; X64-AVX512VL-NEXT:    vpaddusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdc,0xc1]
+; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+  %res = call <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
+  ret <32 x i8> %res
+}
+declare <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8>, <32 x i8>) nounwind readnone
+
+
+define <16 x i16> @test_x86_avx2_paddus_w(<16 x i16> %a0, <16 x i16> %a1) {
+; X86-AVX-LABEL: test_x86_avx2_paddus_w:
+; X86-AVX:       ## %bb.0:
+; X86-AVX-NEXT:    vpaddusw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xdd,0xc1]
+; X86-AVX-NEXT:    retl ## encoding: [0xc3]
+;
+; X86-AVX512VL-LABEL: test_x86_avx2_paddus_w:
+; X86-AVX512VL:       ## %bb.0:
+; X86-AVX512VL-NEXT:    vpaddusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdd,0xc1]
+; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-AVX-LABEL: test_x86_avx2_paddus_w:
+; X64-AVX:       ## %bb.0:
+; X64-AVX-NEXT:    vpaddusw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xdd,0xc1]
+; X64-AVX-NEXT:    retq ## encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_avx2_paddus_w:
+; X64-AVX512VL:       ## %bb.0:
+; X64-AVX512VL-NEXT:    vpaddusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdd,0xc1]
+; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+  %res = call <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
+  ret <16 x i16> %res
+}
+declare <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+
+define <32 x i8> @test_x86_avx2_psubus_b(<32 x i8> %a0, <32 x i8> %a1) {
+; X86-AVX-LABEL: test_x86_avx2_psubus_b:
+; X86-AVX:       ## %bb.0:
+; X86-AVX-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd8,0xc1]
+; X86-AVX-NEXT:    retl ## encoding: [0xc3]
+;
+; X86-AVX512VL-LABEL: test_x86_avx2_psubus_b:
+; X86-AVX512VL:       ## %bb.0:
+; X86-AVX512VL-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd8,0xc1]
+; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-AVX-LABEL: test_x86_avx2_psubus_b:
+; X64-AVX:       ## %bb.0:
+; X64-AVX-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd8,0xc1]
+; X64-AVX-NEXT:    retq ## encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_avx2_psubus_b:
+; X64-AVX512VL:       ## %bb.0:
+; X64-AVX512VL-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd8,0xc1]
+; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+  %res = call <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
+  ret <32 x i8> %res
+}
+declare <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8>, <32 x i8>) nounwind readnone
+
+
+define <16 x i16> @test_x86_avx2_psubus_w(<16 x i16> %a0, <16 x i16> %a1) {
+; X86-AVX-LABEL: test_x86_avx2_psubus_w:
+; X86-AVX:       ## %bb.0:
+; X86-AVX-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd9,0xc1]
+; X86-AVX-NEXT:    retl ## encoding: [0xc3]
+;
+; X86-AVX512VL-LABEL: test_x86_avx2_psubus_w:
+; X86-AVX512VL:       ## %bb.0:
+; X86-AVX512VL-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd9,0xc1]
+; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-AVX-LABEL: test_x86_avx2_psubus_w:
+; X64-AVX:       ## %bb.0:
+; X64-AVX-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd9,0xc1]
+; X64-AVX-NEXT:    retq ## encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_avx2_psubus_w:
+; X64-AVX512VL:       ## %bb.0:
+; X64-AVX512VL-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd9,0xc1]
+; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+  %res = call <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
+  ret <16 x i16> %res
+}
+declare <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16>, <16 x i16>) nounwind readnone
Index: test/CodeGen/X86/avx2-intrinsics-x86.ll
===================================================================
--- test/CodeGen/X86/avx2-intrinsics-x86.ll
+++ test/CodeGen/X86/avx2-intrinsics-x86.ll
@@ -233,58 +233,6 @@
 declare <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16>, <16 x i16>) nounwind readnone
 
 
-define <32 x i8> @test_x86_avx2_paddus_b(<32 x i8> %a0, <32 x i8> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_paddus_b:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpaddusb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xdc,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_paddus_b:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpaddusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdc,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_paddus_b:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpaddusb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xdc,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_paddus_b:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpaddusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdc,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
-  %res = call <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
-  ret <32 x i8> %res
-}
-declare <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8>, <32 x i8>) nounwind readnone
-
-
-define <16 x i16> @test_x86_avx2_paddus_w(<16 x i16> %a0, <16 x i16> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_paddus_w:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpaddusw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xdd,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_paddus_w:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpaddusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdd,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_paddus_w:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpaddusw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xdd,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_paddus_w:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpaddusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdd,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
-  %res = call <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
-  ret <16 x i16> %res
-}
-declare <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16>, <16 x i16>) nounwind readnone
-
-
 define <8 x i32> @test_x86_avx2_pmadd_wd(<16 x i16> %a0, <16 x i16> %a1) {
 ; X86-AVX-LABEL: test_x86_avx2_pmadd_wd:
 ; X86-AVX:       ## %bb.0:
@@ -978,58 +926,6 @@
 }
 declare <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16>, <16 x i16>) nounwind readnone
 
-
-define <32 x i8> @test_x86_avx2_psubus_b(<32 x i8> %a0, <32 x i8> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_psubus_b:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd8,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_psubus_b:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd8,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_psubus_b:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd8,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_psubus_b:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd8,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
-  %res = call <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
-  ret <32 x i8> %res
-}
-declare <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8>, <32 x i8>) nounwind readnone
-
-
-define <16 x i16> @test_x86_avx2_psubus_w(<16 x i16> %a0, <16 x i16> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_psubus_w:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd9,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_psubus_w:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd9,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_psubus_w:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd9,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_psubus_w:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd9,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
-  %res = call <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
-  ret <16 x i16> %res
-}
-declare <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16>, <16 x i16>) nounwind readnone
-
 define <8 x i32> @test_x86_avx2_phadd_d(<8 x i32> %a0, <8 x i32> %a1) {
 ; X86-LABEL: test_x86_avx2_phadd_d:
 ; X86:       ## %bb.0:
@@ -1330,28 +1226,28 @@
 ; X86-AVX:       ## %bb.0:
 ; X86-AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0]
 ; X86-AVX-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
-; X86-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI54_0, kind: FK_Data_4
+; X86-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI50_0, kind: FK_Data_4
 ; X86-AVX-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X86-AVX512VL-LABEL: test_x86_avx2_packusdw_fold:
 ; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vmovaps LCPI54_0, %ymm0 ## EVEX TO VEX Compression ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0]
+; X86-AVX512VL-NEXT:    vmovaps LCPI50_0, %ymm0 ## EVEX TO VEX Compression ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0]
 ; X86-AVX512VL-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
-; X86-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI54_0, kind: FK_Data_4
+; X86-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI50_0, kind: FK_Data_4
 ; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X64-AVX-LABEL: test_x86_avx2_packusdw_fold:
 ; X64-AVX:       ## %bb.0:
 ; X64-AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0]
 ; X64-AVX-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
-; X64-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI54_0-4, kind: reloc_riprel_4byte
+; X64-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI50_0-4, kind: reloc_riprel_4byte
 ; X64-AVX-NEXT:    retq ## encoding: [0xc3]
 ;
 ; X64-AVX512VL-LABEL: test_x86_avx2_packusdw_fold:
 ; X64-AVX512VL:       ## %bb.0:
 ; X64-AVX512VL-NEXT:    vmovaps {{.*}}(%rip), %ymm0 ## EVEX TO VEX Compression ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0]
 ; X64-AVX512VL-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
-; X64-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI54_0-4, kind: reloc_riprel_4byte
+; X64-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI50_0-4, kind: reloc_riprel_4byte
 ; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> zeroinitializer, <8 x i32> <i32 255, i32 32767, i32 65535, i32 -1, i32 -32767, i32 -65535, i32 0, i32 -256>)
   ret <16 x i16> %res
@@ -2071,36 +1967,36 @@
 ; X86-AVX:       ## %bb.0:
 ; X86-AVX-NEXT:    vmovdqa {{.*#+}} xmm0 = [2,9,4294967284,23]
 ; X86-AVX-NEXT:    ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A]
-; X86-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI86_0, kind: FK_Data_4
-; X86-AVX-NEXT:    vpsravd LCPI86_1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A]
-; X86-AVX-NEXT:    ## fixup A - offset: 5, value: LCPI86_1, kind: FK_Data_4
+; X86-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI82_0, kind: FK_Data_4
+; X86-AVX-NEXT:    vpsravd LCPI82_1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A]
+; X86-AVX-NEXT:    ## fixup A - offset: 5, value: LCPI82_1, kind: FK_Data_4
 ; X86-AVX-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X86-AVX512VL-LABEL: test_x86_avx2_psrav_d_const:
 ; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vmovdqa LCPI86_0, %xmm0 ## EVEX TO VEX Compression xmm0 = [2,9,4294967284,23]
+; X86-AVX512VL-NEXT:    vmovdqa LCPI82_0, %xmm0 ## EVEX TO VEX Compression xmm0 = [2,9,4294967284,23]
 ; X86-AVX512VL-NEXT:    ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A]
-; X86-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI86_0, kind: FK_Data_4
-; X86-AVX512VL-NEXT:    vpsravd LCPI86_1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A]
-; X86-AVX512VL-NEXT:    ## fixup A - offset: 5, value: LCPI86_1, kind: FK_Data_4
+; X86-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI82_0, kind: FK_Data_4
+; X86-AVX512VL-NEXT:    vpsravd LCPI82_1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A]
+; X86-AVX512VL-NEXT:    ## fixup A - offset: 5, value: LCPI82_1, kind: FK_Data_4
 ; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X64-AVX-LABEL: test_x86_avx2_psrav_d_const:
 ; X64-AVX:       ## %bb.0:
 ; X64-AVX-NEXT:    vmovdqa {{.*#+}} xmm0 = [2,9,4294967284,23]
 ; X64-AVX-NEXT:    ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A]
-; X64-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI86_0-4, kind: reloc_riprel_4byte
+; X64-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI82_0-4, kind: reloc_riprel_4byte
 ; X64-AVX-NEXT:    vpsravd {{.*}}(%rip), %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A]
-; X64-AVX-NEXT:    ## fixup A - offset: 5, value: LCPI86_1-4, kind: reloc_riprel_4byte
+; X64-AVX-NEXT:    ## fixup A - offset: 5, value: LCPI82_1-4, kind: reloc_riprel_4byte
 ; X64-AVX-NEXT:    retq ## encoding: [0xc3]
 ;
 ; X64-AVX512VL-LABEL: test_x86_avx2_psrav_d_const:
 ; X64-AVX512VL:       ## %bb.0:
 ; X64-AVX512VL-NEXT:    vmovdqa {{.*}}(%rip), %xmm0 ## EVEX TO VEX Compression xmm0 = [2,9,4294967284,23]
 ; X64-AVX512VL-NEXT:    ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A]
-; X64-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI86_0-4, kind: reloc_riprel_4byte
+; X64-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI82_0-4, kind: reloc_riprel_4byte
 ; X64-AVX512VL-NEXT:    vpsravd {{.*}}(%rip), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A]
-; X64-AVX512VL-NEXT:    ## fixup A - offset: 5, value: LCPI86_1-4, kind: reloc_riprel_4byte
+; X64-AVX512VL-NEXT:    ## fixup A - offset: 5, value: LCPI82_1-4, kind: reloc_riprel_4byte
 ; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
   %res = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> <i32 2, i32 9, i32 -12, i32 23>, <4 x i32> <i32 1, i32 18, i32 35, i32 52>)
   ret <4 x i32> %res
@@ -2136,36 +2032,36 @@
 ; X86-AVX:       ## %bb.0:
 ; X86-AVX-NEXT:    vmovdqa {{.*#+}} ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51]
 ; X86-AVX-NEXT:    ## encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A]
-; X86-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI88_0, kind: FK_Data_4
-; X86-AVX-NEXT:    vpsravd LCPI88_1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A]
-; X86-AVX-NEXT:    ## fixup A - offset: 5, value: LCPI88_1, kind: FK_Data_4
+; X86-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI84_0, kind: FK_Data_4
+; X86-AVX-NEXT:    vpsravd LCPI84_1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A]
+; X86-AVX-NEXT:    ## fixup A - offset: 5, value: LCPI84_1, kind: FK_Data_4
 ; X86-AVX-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X86-AVX512VL-LABEL: test_x86_avx2_psrav_d_256_const:
 ; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vmovdqa LCPI88_0, %ymm0 ## EVEX TO VEX Compression ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51]
+; X86-AVX512VL-NEXT:    vmovdqa LCPI84_0, %ymm0 ## EVEX TO VEX Compression ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51]
 ; X86-AVX512VL-NEXT:    ## encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A]
-; X86-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI88_0, kind: FK_Data_4
-; X86-AVX512VL-NEXT:    vpsravd LCPI88_1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A]
-; X86-AVX512VL-NEXT:    ## fixup A - offset: 5, value: LCPI88_1, kind: FK_Data_4
+; X86-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI84_0, kind: FK_Data_4
+; X86-AVX512VL-NEXT:    vpsravd LCPI84_1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A]
+; X86-AVX512VL-NEXT:    ## fixup A - offset: 5, value: LCPI84_1, kind: FK_Data_4
 ; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X64-AVX-LABEL: test_x86_avx2_psrav_d_256_const:
 ; X64-AVX:       ## %bb.0:
 ; X64-AVX-NEXT:    vmovdqa {{.*#+}} ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51]
 ; X64-AVX-NEXT:    ## encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A]
-; X64-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI88_0-4, kind: reloc_riprel_4byte
+; X64-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI84_0-4, kind: reloc_riprel_4byte
 ; X64-AVX-NEXT:    vpsravd {{.*}}(%rip), %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A]
-; X64-AVX-NEXT:    ## fixup A - offset: 5, value: LCPI88_1-4, kind: reloc_riprel_4byte
+; X64-AVX-NEXT:    ## fixup A - offset: 5, value: LCPI84_1-4, kind: reloc_riprel_4byte
 ; X64-AVX-NEXT:    retq ## encoding: [0xc3]
 ;
 ; X64-AVX512VL-LABEL: test_x86_avx2_psrav_d_256_const:
 ; X64-AVX512VL:       ## %bb.0:
 ; X64-AVX512VL-NEXT:    vmovdqa {{.*}}(%rip), %ymm0 ## EVEX TO VEX Compression ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51]
 ; X64-AVX512VL-NEXT:    ## encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A]
-; X64-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI88_0-4, kind: reloc_riprel_4byte
+; X64-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI84_0-4, kind: reloc_riprel_4byte
 ; X64-AVX512VL-NEXT:    vpsravd {{.*}}(%rip), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A]
-; X64-AVX512VL-NEXT:    ## fixup A - offset: 5, value: LCPI88_1-4, kind: reloc_riprel_4byte
+; X64-AVX512VL-NEXT:    ## fixup A - offset: 5, value: LCPI84_1-4, kind: reloc_riprel_4byte
 ; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> <i32 2, i32 9, i32 -12, i32 23, i32 -26, i32 37, i32 -40, i32 51>, <8 x i32> <i32 1, i32 18, i32 35, i32 52, i32 69, i32 15, i32 32, i32 49>)
   ret <8 x i32> %res
Index: test/CodeGen/X86/avx512bw-intrinsics-canonical.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/avx512bw-intrinsics-canonical.ll
@@ -0,0 +1,675 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512BW
+; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512F-32
+
+; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512bw-builtins.c
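+; The tests below exercise the canonical expansions of the saturating-arithmetic
+; builtins rather than the removed x86 intrinsics: signed adds/subs sign-extend to
+; a wider element type, perform add/sub nsw, clamp to [-32768, 32767] with
+; icmp/select, and truncate back; unsigned adds compare the wrapped sum against an
+; operand and select all-ones on overflow; unsigned subs take the unsigned max of
+; the operands before subtracting so the result never wraps below zero.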
+
+define <32 x i16> @test_mask_adds_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) {
+; AVX512BW-LABEL: test_mask_adds_epi16_rr_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    vpaddsw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_adds_epi16_rr_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    vpaddsw %zmm1, %zmm0, %zmm0
+; AVX512F-32-NEXT:    retl
+  %1 = sext <32 x i16> %a to <32 x i32>
+  %2 = sext <32 x i16> %b to <32 x i32>
+  %3 = add nsw <32 x i32> %1, %2
+  %4 = icmp slt <32 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %6 = icmp sgt <32 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %7 = select <32 x i1> %6, <32 x i32> %5, <32 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %8 = trunc <32 x i32> %7 to <32 x i16>
+  ret <32 x i16> %8
+}
+
+define <32 x i16> @test_mask_adds_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) {
+; AVX512BW-LABEL: test_mask_adds_epi16_rrk_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    vpaddsw %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_adds_epi16_rrk_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    vpaddsw %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512F-32-NEXT:    vmovdqa64 %zmm2, %zmm0
+; AVX512F-32-NEXT:    retl
+  %1 = sext <32 x i16> %a to <32 x i32>
+  %2 = sext <32 x i16> %b to <32 x i32>
+  %3 = add nsw <32 x i32> %1, %2
+  %4 = icmp slt <32 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %6 = icmp sgt <32 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %7 = select <32 x i1> %6, <32 x i32> %5, <32 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %8 = trunc <32 x i32> %7 to <32 x i16>
+  %9 = bitcast i32 %mask to <32 x i1>
+  %10 = select <32 x i1> %9, <32 x i16> %8, <32 x i16> %passThru
+  ret <32 x i16> %10
+}
+
+define <32 x i16> @test_mask_adds_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
+; AVX512BW-LABEL: test_mask_adds_epi16_rrkz_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    vpaddsw %zmm1, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_adds_epi16_rrkz_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    vpaddsw %zmm1, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-32-NEXT:    retl
+  %1 = sext <32 x i16> %a to <32 x i32>
+  %2 = sext <32 x i16> %b to <32 x i32>
+  %3 = add nsw <32 x i32> %1, %2
+  %4 = icmp slt <32 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %6 = icmp sgt <32 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %7 = select <32 x i1> %6, <32 x i32> %5, <32 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %8 = trunc <32 x i32> %7 to <32 x i16>
+  %9 = bitcast i32 %mask to <32 x i1>
+  %10 = select <32 x i1> %9, <32 x i16> %8, <32 x i16> zeroinitializer
+  ret <32 x i16> %10
+}
+
+define <32 x i16> @test_mask_adds_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
+; AVX512BW-LABEL: test_mask_adds_epi16_rm_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    vpaddsw (%rdi), %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_adds_epi16_rm_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT:    vpaddsw (%eax), %zmm0, %zmm0
+; AVX512F-32-NEXT:    retl
+  %b = load <32 x i16>, <32 x i16>* %ptr_b
+  %1 = sext <32 x i16> %a to <32 x i32>
+  %2 = sext <32 x i16> %b to <32 x i32>
+  %3 = add nsw <32 x i32> %1, %2
+  %4 = icmp slt <32 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %6 = icmp sgt <32 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %7 = select <32 x i1> %6, <32 x i32> %5, <32 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %8 = trunc <32 x i32> %7 to <32 x i16>
+  ret <32 x i16> %8
+}
+
+define <32 x i16> @test_mask_adds_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
+; AVX512BW-LABEL: test_mask_adds_epi16_rmk_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    kmovd %esi, %k1
+; AVX512BW-NEXT:    vpaddsw (%rdi), %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_adds_epi16_rmk_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    vpaddsw (%eax), %zmm0, %zmm1 {%k1}
+; AVX512F-32-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512F-32-NEXT:    retl
+  %b = load <32 x i16>, <32 x i16>* %ptr_b
+  %1 = sext <32 x i16> %a to <32 x i32>
+  %2 = sext <32 x i16> %b to <32 x i32>
+  %3 = add nsw <32 x i32> %1, %2
+  %4 = icmp slt <32 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %6 = icmp sgt <32 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %7 = select <32 x i1> %6, <32 x i32> %5, <32 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %8 = trunc <32 x i32> %7 to <32 x i16>
+  %9 = bitcast i32 %mask to <32 x i1>
+  %10 = select <32 x i1> %9, <32 x i16> %8, <32 x i16> %passThru
+  ret <32 x i16> %10
+}
+
+define <32 x i16> @test_mask_adds_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) {
+; AVX512BW-LABEL: test_mask_adds_epi16_rmkz_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    kmovd %esi, %k1
+; AVX512BW-NEXT:    vpaddsw (%rdi), %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_adds_epi16_rmkz_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    vpaddsw (%eax), %zmm0, %zmm0 {%k1} {z}
+; AVX512F-32-NEXT:    retl
+  %b = load <32 x i16>, <32 x i16>* %ptr_b
+  %1 = sext <32 x i16> %a to <32 x i32>
+  %2 = sext <32 x i16> %b to <32 x i32>
+  %3 = add nsw <32 x i32> %1, %2
+  %4 = icmp slt <32 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %6 = icmp sgt <32 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %7 = select <32 x i1> %6, <32 x i32> %5, <32 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %8 = trunc <32 x i32> %7 to <32 x i16>
+  %9 = bitcast i32 %mask to <32 x i1>
+  %10 = select <32 x i1> %9, <32 x i16> %8, <32 x i16> zeroinitializer
+  ret <32 x i16> %10
+}
+
+define <32 x i16> @test_mask_subs_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) {
+; AVX512BW-LABEL: test_mask_subs_epi16_rr_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    vpsubsw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_subs_epi16_rr_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    vpsubsw %zmm1, %zmm0, %zmm0
+; AVX512F-32-NEXT:    retl
+  %1 = sext <32 x i16> %a to <32 x i32>
+  %2 = sext <32 x i16> %b to <32 x i32>
+  %3 = sub nsw <32 x i32> %1, %2
+  %4 = icmp slt <32 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %6 = icmp sgt <32 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %7 = select <32 x i1> %6, <32 x i32> %5, <32 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %8 = trunc <32 x i32> %7 to <32 x i16>
+  ret <32 x i16> %8
+}
+
+define <32 x i16> @test_mask_subs_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) {
+; AVX512BW-LABEL: test_mask_subs_epi16_rrk_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    vpsubsw %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_subs_epi16_rrk_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    vpsubsw %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512F-32-NEXT:    vmovdqa64 %zmm2, %zmm0
+; AVX512F-32-NEXT:    retl
+  %1 = sext <32 x i16> %a to <32 x i32>
+  %2 = sext <32 x i16> %b to <32 x i32>
+  %3 = sub nsw <32 x i32> %1, %2
+  %4 = icmp slt <32 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %6 = icmp sgt <32 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %7 = select <32 x i1> %6, <32 x i32> %5, <32 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %8 = trunc <32 x i32> %7 to <32 x i16>
+  %9 = bitcast i32 %mask to <32 x i1>
+  %10 = select <32 x i1> %9, <32 x i16> %8, <32 x i16> %passThru
+  ret <32 x i16> %10
+}
+
+define <32 x i16> @test_mask_subs_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
+; AVX512BW-LABEL: test_mask_subs_epi16_rrkz_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    vpsubsw %zmm1, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_subs_epi16_rrkz_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    vpsubsw %zmm1, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-32-NEXT:    retl
+  %1 = sext <32 x i16> %a to <32 x i32>
+  %2 = sext <32 x i16> %b to <32 x i32>
+  %3 = sub nsw <32 x i32> %1, %2
+  %4 = icmp slt <32 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %6 = icmp sgt <32 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %7 = select <32 x i1> %6, <32 x i32> %5, <32 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %8 = trunc <32 x i32> %7 to <32 x i16>
+  %9 = bitcast i32 %mask to <32 x i1>
+  %10 = select <32 x i1> %9, <32 x i16> %8, <32 x i16> zeroinitializer
+  ret <32 x i16> %10
+}
+
+define <32 x i16> @test_mask_subs_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
+; AVX512BW-LABEL: test_mask_subs_epi16_rm_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    vpsubsw (%rdi), %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_subs_epi16_rm_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT:    vpsubsw (%eax), %zmm0, %zmm0
+; AVX512F-32-NEXT:    retl
+  %b = load <32 x i16>, <32 x i16>* %ptr_b
+  %1 = sext <32 x i16> %a to <32 x i32>
+  %2 = sext <32 x i16> %b to <32 x i32>
+  %3 = sub nsw <32 x i32> %1, %2
+  %4 = icmp slt <32 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %6 = icmp sgt <32 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %7 = select <32 x i1> %6, <32 x i32> %5, <32 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %8 = trunc <32 x i32> %7 to <32 x i16>
+  ret <32 x i16> %8
+}
+
+define <32 x i16> @test_mask_subs_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
+; AVX512BW-LABEL: test_mask_subs_epi16_rmk_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    kmovd %esi, %k1
+; AVX512BW-NEXT:    vpsubsw (%rdi), %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_subs_epi16_rmk_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    vpsubsw (%eax), %zmm0, %zmm1 {%k1}
+; AVX512F-32-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512F-32-NEXT:    retl
+  %b = load <32 x i16>, <32 x i16>* %ptr_b
+  %1 = sext <32 x i16> %a to <32 x i32>
+  %2 = sext <32 x i16> %b to <32 x i32>
+  %3 = sub nsw <32 x i32> %1, %2
+  %4 = icmp slt <32 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %6 = icmp sgt <32 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %7 = select <32 x i1> %6, <32 x i32> %5, <32 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %8 = trunc <32 x i32> %7 to <32 x i16>
+  %9 = bitcast i32 %mask to <32 x i1>
+  %10 = select <32 x i1> %9, <32 x i16> %8, <32 x i16> %passThru
+  ret <32 x i16> %10
+}
+
+define <32 x i16> @test_mask_subs_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) {
+; AVX512BW-LABEL: test_mask_subs_epi16_rmkz_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    kmovd %esi, %k1
+; AVX512BW-NEXT:    vpsubsw (%rdi), %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_subs_epi16_rmkz_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    vpsubsw (%eax), %zmm0, %zmm0 {%k1} {z}
+; AVX512F-32-NEXT:    retl
+  %b = load <32 x i16>, <32 x i16>* %ptr_b
+  %1 = sext <32 x i16> %a to <32 x i32>
+  %2 = sext <32 x i16> %b to <32 x i32>
+  %3 = sub nsw <32 x i32> %1, %2
+  %4 = icmp slt <32 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %6 = icmp sgt <32 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %7 = select <32 x i1> %6, <32 x i32> %5, <32 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %8 = trunc <32 x i32> %7 to <32 x i16>
+  %9 = bitcast i32 %mask to <32 x i1>
+  %10 = select <32 x i1> %9, <32 x i16> %8, <32 x i16> zeroinitializer
+  ret <32 x i16> %10
+}
+
+define <32 x i16> @test_mask_adds_epu16_rr_512(<32 x i16> %a, <32 x i16> %b) {
+; AVX512BW-LABEL: test_mask_adds_epu16_rr_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    vpaddusw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_adds_epu16_rr_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    vpaddusw %zmm1, %zmm0, %zmm0
+; AVX512F-32-NEXT:    retl
+  %1 = add <32 x i16> %a, %b
+  %2 = icmp ugt <32 x i16> %a, %1
+  %3 = select <32 x i1> %2, <32 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <32 x i16> %1
+  ret <32 x i16> %3
+}
+
+define <32 x i16> @test_mask_adds_epu16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) {
+; AVX512BW-LABEL: test_mask_adds_epu16_rrk_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    vpaddusw %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_adds_epu16_rrk_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    vpaddusw %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512F-32-NEXT:    vmovdqa64 %zmm2, %zmm0
+; AVX512F-32-NEXT:    retl
+  %1 = add <32 x i16> %a, %b
+  %2 = icmp ugt <32 x i16> %a, %1
+  %3 = select <32 x i1> %2, <32 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <32 x i16> %1
+  %4 = bitcast i32 %mask to <32 x i1>
+  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> %passThru
+  ret <32 x i16> %5
+}
+
+define <32 x i16> @test_mask_adds_epu16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
+; AVX512BW-LABEL: test_mask_adds_epu16_rrkz_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    vpaddusw %zmm1, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_adds_epu16_rrkz_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    vpaddusw %zmm1, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-32-NEXT:    retl
+  %1 = add <32 x i16> %a, %b
+  %2 = icmp ugt <32 x i16> %a, %1
+  %3 = select <32 x i1> %2, <32 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <32 x i16> %1
+  %4 = bitcast i32 %mask to <32 x i1>
+  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> zeroinitializer
+  ret <32 x i16> %5
+}
+
+define <32 x i16> @test_mask_adds_epu16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
+; AVX512BW-LABEL: test_mask_adds_epu16_rm_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    vpaddusw (%rdi), %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_adds_epu16_rm_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT:    vpaddusw (%eax), %zmm0, %zmm0
+; AVX512F-32-NEXT:    retl
+  %b = load <32 x i16>, <32 x i16>* %ptr_b
+  %1 = add <32 x i16> %a, %b
+  %2 = icmp ugt <32 x i16> %a, %1
+  %3 = select <32 x i1> %2, <32 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <32 x i16> %1
+  ret <32 x i16> %3
+}
+
+define <32 x i16> @test_mask_adds_epu16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
+; AVX512BW-LABEL: test_mask_adds_epu16_rmk_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    kmovd %esi, %k1
+; AVX512BW-NEXT:    vpaddusw (%rdi), %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_adds_epu16_rmk_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    vpaddusw (%eax), %zmm0, %zmm1 {%k1}
+; AVX512F-32-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512F-32-NEXT:    retl
+  %b = load <32 x i16>, <32 x i16>* %ptr_b
+  %1 = add <32 x i16> %a, %b
+  %2 = icmp ugt <32 x i16> %a, %1
+  %3 = select <32 x i1> %2, <32 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <32 x i16> %1
+  %4 = bitcast i32 %mask to <32 x i1>
+  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> %passThru
+  ret <32 x i16> %5
+}
+
+define <32 x i16> @test_mask_adds_epu16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) {
+; AVX512BW-LABEL: test_mask_adds_epu16_rmkz_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    kmovd %esi, %k1
+; AVX512BW-NEXT:    vpaddusw (%rdi), %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_adds_epu16_rmkz_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    vpaddusw (%eax), %zmm0, %zmm0 {%k1} {z}
+; AVX512F-32-NEXT:    retl
+  %b = load <32 x i16>, <32 x i16>* %ptr_b
+  %1 = add <32 x i16> %a, %b
+  %2 = icmp ugt <32 x i16> %a, %1
+  %3 = select <32 x i1> %2, <32 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <32 x i16> %1
+  %4 = bitcast i32 %mask to <32 x i1>
+  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> zeroinitializer
+  ret <32 x i16> %5
+}
+
+define <32 x i16> @test_mask_subs_epu16_rr_512(<32 x i16> %a, <32 x i16> %b) {
+; AVX512BW-LABEL: test_mask_subs_epu16_rr_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    vpsubusw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_subs_epu16_rr_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    vpsubusw %zmm1, %zmm0, %zmm0
+; AVX512F-32-NEXT:    retl
+  %cmp = icmp ugt <32 x i16> %a, %b
+  %sel = select <32 x i1> %cmp, <32 x i16> %a, <32 x i16> %b
+  %sub = sub <32 x i16> %sel, %b
+  ret <32 x i16> %sub
+}
+
+define <32 x i16> @test_mask_subs_epu16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) {
+; AVX512BW-LABEL: test_mask_subs_epu16_rrk_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    vpsubusw %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_subs_epu16_rrk_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    vpsubusw %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512F-32-NEXT:    vmovdqa64 %zmm2, %zmm0
+; AVX512F-32-NEXT:    retl
+  %cmp = icmp ugt <32 x i16> %a, %b
+  %sel = select <32 x i1> %cmp, <32 x i16> %a, <32 x i16> %b
+  %sub = sub <32 x i16> %sel, %b
+  %bc = bitcast i32 %mask to <32 x i1>
+  %res = select <32 x i1> %bc, <32 x i16> %sub, <32 x i16> %passThru
+  ret <32 x i16> %res
+}
+
+define <32 x i16> @test_mask_subs_epu16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
+; AVX512BW-LABEL: test_mask_subs_epu16_rrkz_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    vpsubusw %zmm1, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_subs_epu16_rrkz_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    vpsubusw %zmm1, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-32-NEXT:    retl
+  %cmp = icmp ugt <32 x i16> %a, %b
+  %sel = select <32 x i1> %cmp, <32 x i16> %a, <32 x i16> %b
+  %sub = sub <32 x i16> %sel, %b
+  %bc = bitcast i32 %mask to <32 x i1>
+  %res = select <32 x i1> %bc, <32 x i16> %sub, <32 x i16> zeroinitializer
+  ret <32 x i16> %res
+}
+
+define <32 x i16> @test_mask_subs_epu16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
+; AVX512BW-LABEL: test_mask_subs_epu16_rm_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    vpsubusw (%rdi), %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_subs_epu16_rm_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT:    vpsubusw (%eax), %zmm0, %zmm0
+; AVX512F-32-NEXT:    retl
+  %b = load <32 x i16>, <32 x i16>* %ptr_b
+  %cmp = icmp ugt <32 x i16> %a, %b
+  %sel = select <32 x i1> %cmp, <32 x i16> %a, <32 x i16> %b
+  %sub = sub <32 x i16> %sel, %b
+  ret <32 x i16> %sub
+}
+
+define <32 x i16> @test_mask_subs_epu16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
+; AVX512BW-LABEL: test_mask_subs_epu16_rmk_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    kmovd %esi, %k1
+; AVX512BW-NEXT:    vpsubusw (%rdi), %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_subs_epu16_rmk_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    vpsubusw (%eax), %zmm0, %zmm1 {%k1}
+; AVX512F-32-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512F-32-NEXT:    retl
+  %b = load <32 x i16>, <32 x i16>* %ptr_b
+  %cmp = icmp ugt <32 x i16> %a, %b
+  %sel = select <32 x i1> %cmp, <32 x i16> %a, <32 x i16> %b
+  %sub = sub <32 x i16> %sel, %b
+  %bc = bitcast i32 %mask to <32 x i1>
+  %res = select <32 x i1> %bc, <32 x i16> %sub, <32 x i16> %passThru
+  ret <32 x i16> %res
+}
+
+define <32 x i16> @test_mask_subs_epu16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) {
+; AVX512BW-LABEL: test_mask_subs_epu16_rmkz_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    kmovd %esi, %k1
+; AVX512BW-NEXT:    vpsubusw (%rdi), %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_subs_epu16_rmkz_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    vpsubusw (%eax), %zmm0, %zmm0 {%k1} {z}
+; AVX512F-32-NEXT:    retl
+  %b = load <32 x i16>, <32 x i16>* %ptr_b
+  %cmp = icmp ugt <32 x i16> %a, %b
+  %sel = select <32 x i1> %cmp, <32 x i16> %a, <32 x i16> %b
+  %sub = sub <32 x i16> %sel, %b
+  %bc = bitcast i32 %mask to <32 x i1>
+  %res = select <32 x i1> %bc, <32 x i16> %sub, <32 x i16> zeroinitializer
+  ret <32 x i16> %res
+}
+
+define <64 x i16> @test_mask_adds_epi16_rr_1024(<64 x i16> %a, <64 x i16> %b) {
+; AVX512BW-LABEL: test_mask_adds_epi16_rr_1024:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    vpaddsw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpaddsw %zmm3, %zmm1, %zmm1
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_adds_epi16_rr_1024:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    pushl %ebp
+; AVX512F-32-NEXT:    .cfi_def_cfa_offset 8
+; AVX512F-32-NEXT:    .cfi_offset %ebp, -8
+; AVX512F-32-NEXT:    movl %esp, %ebp
+; AVX512F-32-NEXT:    .cfi_def_cfa_register %ebp
+; AVX512F-32-NEXT:    andl $-64, %esp
+; AVX512F-32-NEXT:    subl $64, %esp
+; AVX512F-32-NEXT:    vpaddsw %zmm2, %zmm0, %zmm0
+; AVX512F-32-NEXT:    vpaddsw 8(%ebp), %zmm1, %zmm1
+; AVX512F-32-NEXT:    movl %ebp, %esp
+; AVX512F-32-NEXT:    popl %ebp
+; AVX512F-32-NEXT:    .cfi_def_cfa %esp, 4
+; AVX512F-32-NEXT:    retl
+  %1 = sext <64 x i16> %a to <64 x i32>
+  %2 = sext <64 x i16> %b to <64 x i32>
+  %3 = add nsw <64 x i32> %1, %2
+  %4 = icmp slt <64 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %5 = select <64 x i1> %4, <64 x i32> %3, <64 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %6 = icmp sgt <64 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %7 = select <64 x i1> %6, <64 x i32> %5, <64 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %8 = trunc <64 x i32> %7 to <64 x i16>
+  ret <64 x i16> %8
+}
+
+define <64 x i16> @test_mask_subs_epi16_rr_1024(<64 x i16> %a, <64 x i16> %b) {
+; AVX512BW-LABEL: test_mask_subs_epi16_rr_1024:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    vpsubsw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpsubsw %zmm3, %zmm1, %zmm1
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_subs_epi16_rr_1024:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    pushl %ebp
+; AVX512F-32-NEXT:    .cfi_def_cfa_offset 8
+; AVX512F-32-NEXT:    .cfi_offset %ebp, -8
+; AVX512F-32-NEXT:    movl %esp, %ebp
+; AVX512F-32-NEXT:    .cfi_def_cfa_register %ebp
+; AVX512F-32-NEXT:    andl $-64, %esp
+; AVX512F-32-NEXT:    subl $64, %esp
+; AVX512F-32-NEXT:    vpsubsw %zmm2, %zmm0, %zmm0
+; AVX512F-32-NEXT:    vpsubsw 8(%ebp), %zmm1, %zmm1
+; AVX512F-32-NEXT:    movl %ebp, %esp
+; AVX512F-32-NEXT:    popl %ebp
+; AVX512F-32-NEXT:    .cfi_def_cfa %esp, 4
+; AVX512F-32-NEXT:    retl
+  %1 = sext <64 x i16> %a to <64 x i32>
+  %2 = sext <64 x i16> %b to <64 x i32>
+  %3 = sub nsw <64 x i32> %1, %2
+  %4 = icmp slt <64 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %5 = select <64 x i1> %4, <64 x i32> %3, <64 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %6 = icmp sgt <64 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %7 = select <64 x i1> %6, <64 x i32> %5, <64 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %8 = trunc <64 x i32> %7 to <64 x i16>
+  ret <64 x i16> %8
+}
+
+define <64 x i16> @test_mask_adds_epu16_rr_1024(<64 x i16> %a, <64 x i16> %b) {
+; AVX512BW-LABEL: test_mask_adds_epu16_rr_1024:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    vpaddusw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpaddusw %zmm3, %zmm1, %zmm1
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_adds_epu16_rr_1024:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    pushl %ebp
+; AVX512F-32-NEXT:    .cfi_def_cfa_offset 8
+; AVX512F-32-NEXT:    .cfi_offset %ebp, -8
+; AVX512F-32-NEXT:    movl %esp, %ebp
+; AVX512F-32-NEXT:    .cfi_def_cfa_register %ebp
+; AVX512F-32-NEXT:    andl $-64, %esp
+; AVX512F-32-NEXT:    subl $64, %esp
+; AVX512F-32-NEXT:    vpaddusw %zmm2, %zmm0, %zmm0
+; AVX512F-32-NEXT:    vpaddusw 8(%ebp), %zmm1, %zmm1
+; AVX512F-32-NEXT:    movl %ebp, %esp
+; AVX512F-32-NEXT:    popl %ebp
+; AVX512F-32-NEXT:    .cfi_def_cfa %esp, 4
+; AVX512F-32-NEXT:    retl
+  %1 = add <64 x i16> %a, %b
+  %2 = icmp ugt <64 x i16> %a, %1
+  %3 = select <64 x i1> %2, <64 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <64 x i16> %1
+  ret <64 x i16> %3
+}
+
+define <64 x i16> @test_mask_subs_epu16_rr_1024(<64 x i16> %a, <64 x i16> %b) {
+; AVX512BW-LABEL: test_mask_subs_epu16_rr_1024:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    vpsubusw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpsubusw %zmm3, %zmm1, %zmm1
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_subs_epu16_rr_1024:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    pushl %ebp
+; AVX512F-32-NEXT:    .cfi_def_cfa_offset 8
+; AVX512F-32-NEXT:    .cfi_offset %ebp, -8
+; AVX512F-32-NEXT:    movl %esp, %ebp
+; AVX512F-32-NEXT:    .cfi_def_cfa_register %ebp
+; AVX512F-32-NEXT:    andl $-64, %esp
+; AVX512F-32-NEXT:    subl $64, %esp
+; AVX512F-32-NEXT:    vpsubusw %zmm2, %zmm0, %zmm0
+; AVX512F-32-NEXT:    vpsubusw 8(%ebp), %zmm1, %zmm1
+; AVX512F-32-NEXT:    movl %ebp, %esp
+; AVX512F-32-NEXT:    popl %ebp
+; AVX512F-32-NEXT:    .cfi_def_cfa %esp, 4
+; AVX512F-32-NEXT:    retl
+  %cmp = icmp ugt <64 x i16> %a, %b
+  %sel = select <64 x i1> %cmp, <64 x i16> %a, <64 x i16> %b
+  %sub = sub <64 x i16> %sel, %b
+  ret <64 x i16> %sub
+}
Index: test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
===================================================================
--- test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
+++ test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
@@ -2865,3 +2865,211 @@
   %res2 = add <32 x i16> %res, %res1
   ret <32 x i16> %res2
 }
+
+define <32 x i16> @test_mask_adds_epu16_rr_512(<32 x i16> %a, <32 x i16> %b) {
+; AVX512BW-LABEL: test_mask_adds_epu16_rr_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    vpaddusw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_adds_epu16_rr_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    vpaddusw %zmm1, %zmm0, %zmm0
+; AVX512F-32-NEXT:    retl
+  %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1)
+  ret <32 x i16> %res
+}
+
+define <32 x i16> @test_mask_adds_epu16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) {
+; AVX512BW-LABEL: test_mask_adds_epu16_rrk_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    vpaddusw %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_adds_epu16_rrk_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    vpaddusw %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512F-32-NEXT:    vmovdqa64 %zmm2, %zmm0
+; AVX512F-32-NEXT:    retl
+  %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
+  ret <32 x i16> %res
+}
+
+define <32 x i16> @test_mask_adds_epu16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
+; AVX512BW-LABEL: test_mask_adds_epu16_rrkz_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    vpaddusw %zmm1, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_adds_epu16_rrkz_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    vpaddusw %zmm1, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-32-NEXT:    retl
+  %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask)
+  ret <32 x i16> %res
+}
+
+define <32 x i16> @test_mask_adds_epu16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
+; AVX512BW-LABEL: test_mask_adds_epu16_rm_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    vpaddusw (%rdi), %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_adds_epu16_rm_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT:    vpaddusw (%eax), %zmm0, %zmm0
+; AVX512F-32-NEXT:    retl
+  %b = load <32 x i16>, <32 x i16>* %ptr_b
+  %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1)
+  ret <32 x i16> %res
+}
+
+define <32 x i16> @test_mask_adds_epu16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
+; AVX512BW-LABEL: test_mask_adds_epu16_rmk_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    kmovd %esi, %k1
+; AVX512BW-NEXT:    vpaddusw (%rdi), %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_adds_epu16_rmk_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    vpaddusw (%eax), %zmm0, %zmm1 {%k1}
+; AVX512F-32-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512F-32-NEXT:    retl
+  %b = load <32 x i16>, <32 x i16>* %ptr_b
+  %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
+  ret <32 x i16> %res
+}
+
+define <32 x i16> @test_mask_adds_epu16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) {
+; AVX512BW-LABEL: test_mask_adds_epu16_rmkz_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    kmovd %esi, %k1
+; AVX512BW-NEXT:    vpaddusw (%rdi), %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_adds_epu16_rmkz_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    vpaddusw (%eax), %zmm0, %zmm0 {%k1} {z}
+; AVX512F-32-NEXT:    retl
+  %b = load <32 x i16>, <32 x i16>* %ptr_b
+  %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask)
+  ret <32 x i16> %res
+}
+
+declare <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
+
+define <32 x i16> @test_mask_subs_epu16_rr_512(<32 x i16> %a, <32 x i16> %b) {
+; AVX512BW-LABEL: test_mask_subs_epu16_rr_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    vpsubusw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_subs_epu16_rr_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    vpsubusw %zmm1, %zmm0, %zmm0
+; AVX512F-32-NEXT:    retl
+  %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1)
+  ret <32 x i16> %res
+}
+
+define <32 x i16> @test_mask_subs_epu16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) {
+; AVX512BW-LABEL: test_mask_subs_epu16_rrk_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    vpsubusw %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_subs_epu16_rrk_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    vpsubusw %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512F-32-NEXT:    vmovdqa64 %zmm2, %zmm0
+; AVX512F-32-NEXT:    retl
+  %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
+  ret <32 x i16> %res
+}
+
+define <32 x i16> @test_mask_subs_epu16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
+; AVX512BW-LABEL: test_mask_subs_epu16_rrkz_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    vpsubusw %zmm1, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_subs_epu16_rrkz_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    vpsubusw %zmm1, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-32-NEXT:    retl
+  %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask)
+  ret <32 x i16> %res
+}
+
+define <32 x i16> @test_mask_subs_epu16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
+; AVX512BW-LABEL: test_mask_subs_epu16_rm_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    vpsubusw (%rdi), %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_subs_epu16_rm_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT:    vpsubusw (%eax), %zmm0, %zmm0
+; AVX512F-32-NEXT:    retl
+  %b = load <32 x i16>, <32 x i16>* %ptr_b
+  %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1)
+  ret <32 x i16> %res
+}
+
+define <32 x i16> @test_mask_subs_epu16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
+; AVX512BW-LABEL: test_mask_subs_epu16_rmk_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    kmovd %esi, %k1
+; AVX512BW-NEXT:    vpsubusw (%rdi), %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_subs_epu16_rmk_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    vpsubusw (%eax), %zmm0, %zmm1 {%k1}
+; AVX512F-32-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512F-32-NEXT:    retl
+  %b = load <32 x i16>, <32 x i16>* %ptr_b
+  %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
+  ret <32 x i16> %res
+}
+
+define <32 x i16> @test_mask_subs_epu16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) {
+; AVX512BW-LABEL: test_mask_subs_epu16_rmkz_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    kmovd %esi, %k1
+; AVX512BW-NEXT:    vpsubusw (%rdi), %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_subs_epu16_rmkz_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    vpsubusw (%eax), %zmm0, %zmm0 {%k1} {z}
+; AVX512F-32-NEXT:    retl
+  %b = load <32 x i16>, <32 x i16>* %ptr_b
+  %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask)
+  ret <32 x i16> %res
+}
+
+declare <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
Index: test/CodeGen/X86/avx512bw-intrinsics.ll
===================================================================
--- test/CodeGen/X86/avx512bw-intrinsics.ll
+++ test/CodeGen/X86/avx512bw-intrinsics.ll
@@ -787,213 +787,7 @@
 
 declare <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
 
-define <32 x i16> @test_mask_adds_epu16_rr_512(<32 x i16> %a, <32 x i16> %b) {
-; AVX512BW-LABEL: test_mask_adds_epu16_rr_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpaddusw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_adds_epu16_rr_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    vpaddusw %zmm1, %zmm0, %zmm0
-; AVX512F-32-NEXT:    retl
-  %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1)
-  ret <32 x i16> %res
-}
-
-define <32 x i16> @test_mask_adds_epu16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) {
-; AVX512BW-LABEL: test_mask_adds_epu16_rrk_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpaddusw %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_adds_epu16_rrk_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpaddusw %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512F-32-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512F-32-NEXT:    retl
-  %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
-  ret <32 x i16> %res
-}
-
-define <32 x i16> @test_mask_adds_epu16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
-; AVX512BW-LABEL: test_mask_adds_epu16_rrkz_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpaddusw %zmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_adds_epu16_rrkz_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpaddusw %zmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-32-NEXT:    retl
-  %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask)
-  ret <32 x i16> %res
-}
-
-define <32 x i16> @test_mask_adds_epu16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
-; AVX512BW-LABEL: test_mask_adds_epu16_rm_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpaddusw (%rdi), %zmm0, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_adds_epu16_rm_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    vpaddusw (%eax), %zmm0, %zmm0
-; AVX512F-32-NEXT:    retl
-  %b = load <32 x i16>, <32 x i16>* %ptr_b
-  %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1)
-  ret <32 x i16> %res
-}
-
-define <32 x i16> @test_mask_adds_epu16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
-; AVX512BW-LABEL: test_mask_adds_epu16_rmk_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %esi, %k1
-; AVX512BW-NEXT:    vpaddusw (%rdi), %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_adds_epu16_rmk_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpaddusw (%eax), %zmm0, %zmm1 {%k1}
-; AVX512F-32-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512F-32-NEXT:    retl
-  %b = load <32 x i16>, <32 x i16>* %ptr_b
-  %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
-  ret <32 x i16> %res
-}
-
-define <32 x i16> @test_mask_adds_epu16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) {
-; AVX512BW-LABEL: test_mask_adds_epu16_rmkz_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %esi, %k1
-; AVX512BW-NEXT:    vpaddusw (%rdi), %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_adds_epu16_rmkz_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpaddusw (%eax), %zmm0, %zmm0 {%k1} {z}
-; AVX512F-32-NEXT:    retl
-  %b = load <32 x i16>, <32 x i16>* %ptr_b
-  %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask)
-  ret <32 x i16> %res
-}
-
-declare <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
-
-define <32 x i16> @test_mask_subs_epu16_rr_512(<32 x i16> %a, <32 x i16> %b) {
-; AVX512BW-LABEL: test_mask_subs_epu16_rr_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpsubusw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_subs_epu16_rr_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    vpsubusw %zmm1, %zmm0, %zmm0
-; AVX512F-32-NEXT:    retl
-  %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1)
-  ret <32 x i16> %res
-}
-
-define <32 x i16> @test_mask_subs_epu16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) {
-; AVX512BW-LABEL: test_mask_subs_epu16_rrk_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpsubusw %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_subs_epu16_rrk_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpsubusw %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512F-32-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512F-32-NEXT:    retl
-  %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
-  ret <32 x i16> %res
-}
-
-define <32 x i16> @test_mask_subs_epu16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
-; AVX512BW-LABEL: test_mask_subs_epu16_rrkz_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpsubusw %zmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_subs_epu16_rrkz_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpsubusw %zmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-32-NEXT:    retl
-  %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask)
-  ret <32 x i16> %res
-}
-
-define <32 x i16> @test_mask_subs_epu16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
-; AVX512BW-LABEL: test_mask_subs_epu16_rm_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpsubusw (%rdi), %zmm0, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_subs_epu16_rm_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    vpsubusw (%eax), %zmm0, %zmm0
-; AVX512F-32-NEXT:    retl
-  %b = load <32 x i16>, <32 x i16>* %ptr_b
-  %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1)
-  ret <32 x i16> %res
-}
-
-define <32 x i16> @test_mask_subs_epu16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
-; AVX512BW-LABEL: test_mask_subs_epu16_rmk_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %esi, %k1
-; AVX512BW-NEXT:    vpsubusw (%rdi), %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_subs_epu16_rmk_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpsubusw (%eax), %zmm0, %zmm1 {%k1}
-; AVX512F-32-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512F-32-NEXT:    retl
-  %b = load <32 x i16>, <32 x i16>* %ptr_b
-  %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
-  ret <32 x i16> %res
-}
-
-define <32 x i16> @test_mask_subs_epu16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) {
-; AVX512BW-LABEL: test_mask_subs_epu16_rmkz_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %esi, %k1
-; AVX512BW-NEXT:    vpsubusw (%rdi), %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_subs_epu16_rmkz_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpsubusw (%eax), %zmm0, %zmm0 {%k1} {z}
-; AVX512F-32-NEXT:    retl
-  %b = load <32 x i16>, <32 x i16>* %ptr_b
-  %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask)
-  ret <32 x i16> %res
-}
-
-declare <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
+declare <32 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
 
 define <32 x i16>@test_int_x86_avx512_mask_vpermt2var_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
 ; AVX512BW-LABEL: test_int_x86_avx512_mask_vpermt2var_hi_512:
Index: test/CodeGen/X86/avx512bwvl-intrinsics-canonical.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/avx512bwvl-intrinsics-canonical.ll
@@ -0,0 +1,1572 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw -mattr=+avx512vl --show-mc-encoding | FileCheck %s
+
+; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512vlbw-builtins.c
+
+define <8 x i16> @test_mask_adds_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test_mask_adds_epi16_rr_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpaddsw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xed,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %1 = sext <8 x i16> %a to <8 x i32>
+  %2 = sext <8 x i16> %b to <8 x i32>
+  %3 = add nsw <8 x i32> %1, %2
+  %4 = icmp slt <8 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %6 = icmp sgt <8 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %8 = trunc <8 x i32> %7 to <8 x i16>
+  ret <8 x i16> %8
+}
+
+define <8 x i16> @test_mask_adds_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_adds_epi16_rrk_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpaddsw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xed,0xd1]
+; CHECK-NEXT:    vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %1 = sext <8 x i16> %a to <8 x i32>
+  %2 = sext <8 x i16> %b to <8 x i32>
+  %3 = add nsw <8 x i32> %1, %2
+  %4 = icmp slt <8 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %6 = icmp sgt <8 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %8 = trunc <8 x i32> %7 to <8 x i16>
+  %9 = bitcast i8 %mask to <8 x i1>
+  %10 = select <8 x i1> %9, <8 x i16> %8, <8 x i16> %passThru
+  ret <8 x i16> %10
+}
+
+define <8 x i16> @test_mask_adds_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_adds_epi16_rrkz_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpaddsw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xed,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %1 = sext <8 x i16> %a to <8 x i32>
+  %2 = sext <8 x i16> %b to <8 x i32>
+  %3 = add nsw <8 x i32> %1, %2
+  %4 = icmp slt <8 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %6 = icmp sgt <8 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %8 = trunc <8 x i32> %7 to <8 x i16>
+  %9 = bitcast i8 %mask to <8 x i1>
+  %10 = select <8 x i1> %9, <8 x i16> %8, <8 x i16> zeroinitializer
+  ret <8 x i16> %10
+}
+
+define <8 x i16> @test_mask_adds_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
+; CHECK-LABEL: test_mask_adds_epi16_rm_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpaddsw (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xed,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <8 x i16>, <8 x i16>* %ptr_b
+  %1 = sext <8 x i16> %a to <8 x i32>
+  %2 = sext <8 x i16> %b to <8 x i32>
+  %3 = add nsw <8 x i32> %1, %2
+  %4 = icmp slt <8 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %6 = icmp sgt <8 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %8 = trunc <8 x i32> %7 to <8 x i16>
+  ret <8 x i16> %8
+}
+
+define <8 x i16> @test_mask_adds_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_adds_epi16_rmk_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpaddsw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xed,0x0f]
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <8 x i16>, <8 x i16>* %ptr_b
+  %1 = sext <8 x i16> %a to <8 x i32>
+  %2 = sext <8 x i16> %b to <8 x i32>
+  %3 = add nsw <8 x i32> %1, %2
+  %4 = icmp slt <8 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %6 = icmp sgt <8 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %8 = trunc <8 x i32> %7 to <8 x i16>
+  %9 = bitcast i8 %mask to <8 x i1>
+  %10 = select <8 x i1> %9, <8 x i16> %8, <8 x i16> %passThru
+  ret <8 x i16> %10
+}
+
+define <8 x i16> @test_mask_adds_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) {
+; CHECK-LABEL: test_mask_adds_epi16_rmkz_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpaddsw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xed,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <8 x i16>, <8 x i16>* %ptr_b
+  %1 = sext <8 x i16> %a to <8 x i32>
+  %2 = sext <8 x i16> %b to <8 x i32>
+  %3 = add nsw <8 x i32> %1, %2
+  %4 = icmp slt <8 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %6 = icmp sgt <8 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %8 = trunc <8 x i32> %7 to <8 x i16>
+  %9 = bitcast i8 %mask to <8 x i1>
+  %10 = select <8 x i1> %9, <8 x i16> %8, <8 x i16> zeroinitializer
+  ret <8 x i16> %10
+}
+
+define <16 x i16> @test_mask_adds_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) {
+; CHECK-LABEL: test_mask_adds_epi16_rr_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpaddsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xed,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %1 = sext <16 x i16> %a to <16 x i32>
+  %2 = sext <16 x i16> %b to <16 x i32>
+  %3 = add nsw <16 x i32> %1, %2
+  %4 = icmp slt <16 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %6 = icmp sgt <16 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %8 = trunc <16 x i32> %7 to <16 x i16>
+  ret <16 x i16> %8
+}
+
+define <16 x i16> @test_mask_adds_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) {
+; CHECK-LABEL: test_mask_adds_epi16_rrk_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpaddsw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xed,0xd1]
+; CHECK-NEXT:    vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %1 = sext <16 x i16> %a to <16 x i32>
+  %2 = sext <16 x i16> %b to <16 x i32>
+  %3 = add nsw <16 x i32> %1, %2
+  %4 = icmp slt <16 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %6 = icmp sgt <16 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %8 = trunc <16 x i32> %7 to <16 x i16>
+  %9 = bitcast i16 %mask to <16 x i1>
+  %10 = select <16 x i1> %9, <16 x i16> %8, <16 x i16> %passThru
+  ret <16 x i16> %10
+}
+
+define <16 x i16> @test_mask_adds_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
+; CHECK-LABEL: test_mask_adds_epi16_rrkz_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpaddsw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xed,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %1 = sext <16 x i16> %a to <16 x i32>
+  %2 = sext <16 x i16> %b to <16 x i32>
+  %3 = add nsw <16 x i32> %1, %2
+  %4 = icmp slt <16 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %6 = icmp sgt <16 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %8 = trunc <16 x i32> %7 to <16 x i16>
+  %9 = bitcast i16 %mask to <16 x i1>
+  %10 = select <16 x i1> %9, <16 x i16> %8, <16 x i16> zeroinitializer
+  ret <16 x i16> %10
+}
+
+define <16 x i16> @test_mask_adds_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
+; CHECK-LABEL: test_mask_adds_epi16_rm_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpaddsw (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xed,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <16 x i16>, <16 x i16>* %ptr_b
+  %1 = sext <16 x i16> %a to <16 x i32>
+  %2 = sext <16 x i16> %b to <16 x i32>
+  %3 = add nsw <16 x i32> %1, %2
+  %4 = icmp slt <16 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %6 = icmp sgt <16 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %8 = trunc <16 x i32> %7 to <16 x i16>
+  ret <16 x i16> %8
+}
+
+define <16 x i16> @test_mask_adds_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
+; CHECK-LABEL: test_mask_adds_epi16_rmk_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpaddsw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xed,0x0f]
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <16 x i16>, <16 x i16>* %ptr_b
+  %1 = sext <16 x i16> %a to <16 x i32>
+  %2 = sext <16 x i16> %b to <16 x i32>
+  %3 = add nsw <16 x i32> %1, %2
+  %4 = icmp slt <16 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %6 = icmp sgt <16 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %8 = trunc <16 x i32> %7 to <16 x i16>
+  %9 = bitcast i16 %mask to <16 x i1>
+  %10 = select <16 x i1> %9, <16 x i16> %8, <16 x i16> %passThru
+  ret <16 x i16> %10
+}
+
+define <16 x i16> @test_mask_adds_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) {
+; CHECK-LABEL: test_mask_adds_epi16_rmkz_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpaddsw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xed,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <16 x i16>, <16 x i16>* %ptr_b
+  %1 = sext <16 x i16> %a to <16 x i32>
+  %2 = sext <16 x i16> %b to <16 x i32>
+  %3 = add nsw <16 x i32> %1, %2
+  %4 = icmp slt <16 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %6 = icmp sgt <16 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %8 = trunc <16 x i32> %7 to <16 x i16>
+  %9 = bitcast i16 %mask to <16 x i1>
+  %10 = select <16 x i1> %9, <16 x i16> %8, <16 x i16> zeroinitializer
+  ret <16 x i16> %10
+}
+
+define <8 x i16> @test_mask_subs_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test_mask_subs_epi16_rr_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpsubsw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe9,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %1 = sext <8 x i16> %a to <8 x i32>
+  %2 = sext <8 x i16> %b to <8 x i32>
+  %3 = sub nsw <8 x i32> %1, %2
+  %4 = icmp slt <8 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %6 = icmp sgt <8 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %8 = trunc <8 x i32> %7 to <8 x i16>
+  ret <8 x i16> %8
+}
+
+define <8 x i16> @test_mask_subs_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_subs_epi16_rrk_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpsubsw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe9,0xd1]
+; CHECK-NEXT:    vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %1 = sext <8 x i16> %a to <8 x i32>
+  %2 = sext <8 x i16> %b to <8 x i32>
+  %3 = sub nsw <8 x i32> %1, %2
+  %4 = icmp slt <8 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %6 = icmp sgt <8 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %8 = trunc <8 x i32> %7 to <8 x i16>
+  %9 = bitcast i8 %mask to <8 x i1>
+  %10 = select <8 x i1> %9, <8 x i16> %8, <8 x i16> %passThru
+  ret <8 x i16> %10
+}
+
+define <8 x i16> @test_mask_subs_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_subs_epi16_rrkz_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpsubsw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xe9,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %1 = sext <8 x i16> %a to <8 x i32>
+  %2 = sext <8 x i16> %b to <8 x i32>
+  %3 = sub nsw <8 x i32> %1, %2
+  %4 = icmp slt <8 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %6 = icmp sgt <8 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %8 = trunc <8 x i32> %7 to <8 x i16>
+  %9 = bitcast i8 %mask to <8 x i1>
+  %10 = select <8 x i1> %9, <8 x i16> %8, <8 x i16> zeroinitializer
+  ret <8 x i16> %10
+}
+
+define <8 x i16> @test_mask_subs_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
+; CHECK-LABEL: test_mask_subs_epi16_rm_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpsubsw (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe9,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <8 x i16>, <8 x i16>* %ptr_b
+  %1 = sext <8 x i16> %a to <8 x i32>
+  %2 = sext <8 x i16> %b to <8 x i32>
+  %3 = sub nsw <8 x i32> %1, %2
+  %4 = icmp slt <8 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %6 = icmp sgt <8 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %8 = trunc <8 x i32> %7 to <8 x i16>
+  ret <8 x i16> %8
+}
+
+define <8 x i16> @test_mask_subs_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_subs_epi16_rmk_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpsubsw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe9,0x0f]
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <8 x i16>, <8 x i16>* %ptr_b
+  %1 = sext <8 x i16> %a to <8 x i32>
+  %2 = sext <8 x i16> %b to <8 x i32>
+  %3 = sub nsw <8 x i32> %1, %2
+  %4 = icmp slt <8 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %6 = icmp sgt <8 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %8 = trunc <8 x i32> %7 to <8 x i16>
+  %9 = bitcast i8 %mask to <8 x i1>
+  %10 = select <8 x i1> %9, <8 x i16> %8, <8 x i16> %passThru
+  ret <8 x i16> %10
+}
+
+define <8 x i16> @test_mask_subs_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) {
+; CHECK-LABEL: test_mask_subs_epi16_rmkz_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpsubsw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xe9,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <8 x i16>, <8 x i16>* %ptr_b
+  %1 = sext <8 x i16> %a to <8 x i32>
+  %2 = sext <8 x i16> %b to <8 x i32>
+  %3 = sub nsw <8 x i32> %1, %2
+  %4 = icmp slt <8 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %6 = icmp sgt <8 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %8 = trunc <8 x i32> %7 to <8 x i16>
+  %9 = bitcast i8 %mask to <8 x i1>
+  %10 = select <8 x i1> %9, <8 x i16> %8, <8 x i16> zeroinitializer
+  ret <8 x i16> %10
+}
+
+define <16 x i16> @test_mask_subs_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) {
+; CHECK-LABEL: test_mask_subs_epi16_rr_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpsubsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe9,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %1 = sext <16 x i16> %a to <16 x i32>
+  %2 = sext <16 x i16> %b to <16 x i32>
+  %3 = sub nsw <16 x i32> %1, %2
+  %4 = icmp slt <16 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %6 = icmp sgt <16 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %8 = trunc <16 x i32> %7 to <16 x i16>
+  ret <16 x i16> %8
+}
+
+define <16 x i16> @test_mask_subs_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) {
+; CHECK-LABEL: test_mask_subs_epi16_rrk_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpsubsw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe9,0xd1]
+; CHECK-NEXT:    vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %1 = sext <16 x i16> %a to <16 x i32>
+  %2 = sext <16 x i16> %b to <16 x i32>
+  %3 = sub nsw <16 x i32> %1, %2
+  %4 = icmp slt <16 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %6 = icmp sgt <16 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %8 = trunc <16 x i32> %7 to <16 x i16>
+  %9 = bitcast i16 %mask to <16 x i1>
+  %10 = select <16 x i1> %9, <16 x i16> %8, <16 x i16> %passThru
+  ret <16 x i16> %10
+}
+
+define <16 x i16> @test_mask_subs_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
+; CHECK-LABEL: test_mask_subs_epi16_rrkz_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpsubsw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xe9,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %1 = sext <16 x i16> %a to <16 x i32>
+  %2 = sext <16 x i16> %b to <16 x i32>
+  %3 = sub nsw <16 x i32> %1, %2
+  %4 = icmp slt <16 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %6 = icmp sgt <16 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %8 = trunc <16 x i32> %7 to <16 x i16>
+  %9 = bitcast i16 %mask to <16 x i1>
+  %10 = select <16 x i1> %9, <16 x i16> %8, <16 x i16> zeroinitializer
+  ret <16 x i16> %10
+}
+
+define <16 x i16> @test_mask_subs_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
+; CHECK-LABEL: test_mask_subs_epi16_rm_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpsubsw (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe9,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <16 x i16>, <16 x i16>* %ptr_b
+  %1 = sext <16 x i16> %a to <16 x i32>
+  %2 = sext <16 x i16> %b to <16 x i32>
+  %3 = sub nsw <16 x i32> %1, %2
+  %4 = icmp slt <16 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %6 = icmp sgt <16 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %8 = trunc <16 x i32> %7 to <16 x i16>
+  ret <16 x i16> %8
+}
+
+define <16 x i16> @test_mask_subs_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
+; CHECK-LABEL: test_mask_subs_epi16_rmk_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpsubsw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe9,0x0f]
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <16 x i16>, <16 x i16>* %ptr_b
+  %1 = sext <16 x i16> %a to <16 x i32>
+  %2 = sext <16 x i16> %b to <16 x i32>
+  %3 = sub nsw <16 x i32> %1, %2
+  %4 = icmp slt <16 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %6 = icmp sgt <16 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %8 = trunc <16 x i32> %7 to <16 x i16>
+  %9 = bitcast i16 %mask to <16 x i1>
+  %10 = select <16 x i1> %9, <16 x i16> %8, <16 x i16> %passThru
+  ret <16 x i16> %10
+}
+
+define <16 x i16> @test_mask_subs_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) {
+; CHECK-LABEL: test_mask_subs_epi16_rmkz_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpsubsw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xe9,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <16 x i16>, <16 x i16>* %ptr_b
+  %1 = sext <16 x i16> %a to <16 x i32>
+  %2 = sext <16 x i16> %b to <16 x i32>
+  %3 = sub nsw <16 x i32> %1, %2
+  %4 = icmp slt <16 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %6 = icmp sgt <16 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %8 = trunc <16 x i32> %7 to <16 x i16>
+  %9 = bitcast i16 %mask to <16 x i1>
+  %10 = select <16 x i1> %9, <16 x i16> %8, <16 x i16> zeroinitializer
+  ret <16 x i16> %10
+}
+
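+; Unsigned saturating add of words: plain add plus an unsigned-overflow check
+; (icmp ugt %a, %sum) that selects all-ones on wrap. Expected to match vpaddusw.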
+define <8 x i16> @test_mask_adds_epu16_rr_128(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test_mask_adds_epu16_rr_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpaddusw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdd,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %1 = add <8 x i16> %a, %b
+  %2 = icmp ugt <8 x i16> %a, %1
+  %3 = select <8 x i1> %2, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <8 x i16> %1
+  ret <8 x i16> %3
+}
+
+define <8 x i16> @test_mask_adds_epu16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_adds_epu16_rrk_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpaddusw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdd,0xd1]
+; CHECK-NEXT:    vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %1 = add <8 x i16> %a, %b
+  %2 = icmp ugt <8 x i16> %a, %1
+  %3 = select <8 x i1> %2, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <8 x i16> %1
+  %4 = bitcast i8 %mask to <8 x i1>
+  %5 = select <8 x i1> %4, <8 x i16> %3, <8 x i16> %passThru
+  ret <8 x i16> %5
+}
+
+define <8 x i16> @test_mask_adds_epu16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_adds_epu16_rrkz_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpaddusw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdd,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %1 = add <8 x i16> %a, %b
+  %2 = icmp ugt <8 x i16> %a, %1
+  %3 = select <8 x i1> %2, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <8 x i16> %1
+  %4 = bitcast i8 %mask to <8 x i1>
+  %5 = select <8 x i1> %4, <8 x i16> %3, <8 x i16> zeroinitializer
+  ret <8 x i16> %5
+}
+
+define <8 x i16> @test_mask_adds_epu16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
+; CHECK-LABEL: test_mask_adds_epu16_rm_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpaddusw (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdd,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <8 x i16>, <8 x i16>* %ptr_b
+  %1 = add <8 x i16> %a, %b
+  %2 = icmp ugt <8 x i16> %a, %1
+  %3 = select <8 x i1> %2, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <8 x i16> %1
+  ret <8 x i16> %3
+}
+
+define <8 x i16> @test_mask_adds_epu16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_adds_epu16_rmk_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpaddusw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdd,0x0f]
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <8 x i16>, <8 x i16>* %ptr_b
+  %1 = add <8 x i16> %a, %b
+  %2 = icmp ugt <8 x i16> %a, %1
+  %3 = select <8 x i1> %2, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <8 x i16> %1
+  %4 = bitcast i8 %mask to <8 x i1>
+  %5 = select <8 x i1> %4, <8 x i16> %3, <8 x i16> %passThru
+  ret <8 x i16> %5
+}
+
+define <8 x i16> @test_mask_adds_epu16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) {
+; CHECK-LABEL: test_mask_adds_epu16_rmkz_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpaddusw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdd,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <8 x i16>, <8 x i16>* %ptr_b
+  %1 = add <8 x i16> %a, %b
+  %2 = icmp ugt <8 x i16> %a, %1
+  %3 = select <8 x i1> %2, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <8 x i16> %1
+  %4 = bitcast i8 %mask to <8 x i1>
+  %5 = select <8 x i1> %4, <8 x i16> %3, <8 x i16> zeroinitializer
+  ret <8 x i16> %5
+}
+
+define <16 x i16> @test_mask_adds_epu16_rr_256(<16 x i16> %a, <16 x i16> %b) {
+; CHECK-LABEL: test_mask_adds_epu16_rr_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpaddusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdd,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %1 = add <16 x i16> %a, %b
+  %2 = icmp ugt <16 x i16> %a, %1
+  %3 = select <16 x i1> %2, <16 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <16 x i16> %1
+  ret <16 x i16> %3
+}
+
+define <16 x i16> @test_mask_adds_epu16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) {
+; CHECK-LABEL: test_mask_adds_epu16_rrk_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpaddusw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdd,0xd1]
+; CHECK-NEXT:    vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %1 = add <16 x i16> %a, %b
+  %2 = icmp ugt <16 x i16> %a, %1
+  %3 = select <16 x i1> %2, <16 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <16 x i16> %1
+  %4 = bitcast i16 %mask to <16 x i1>
+  %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> %passThru
+  ret <16 x i16> %5
+}
+
+define <16 x i16> @test_mask_adds_epu16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
+; CHECK-LABEL: test_mask_adds_epu16_rrkz_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpaddusw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdd,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %1 = add <16 x i16> %a, %b
+  %2 = icmp ugt <16 x i16> %a, %1
+  %3 = select <16 x i1> %2, <16 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <16 x i16> %1
+  %4 = bitcast i16 %mask to <16 x i1>
+  %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> zeroinitializer
+  ret <16 x i16> %5
+}
+
+define <16 x i16> @test_mask_adds_epu16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
+; CHECK-LABEL: test_mask_adds_epu16_rm_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpaddusw (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdd,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <16 x i16>, <16 x i16>* %ptr_b
+  %1 = add <16 x i16> %a, %b
+  %2 = icmp ugt <16 x i16> %a, %1
+  %3 = select <16 x i1> %2, <16 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <16 x i16> %1
+  ret <16 x i16> %3
+}
+
+define <16 x i16> @test_mask_adds_epu16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
+; CHECK-LABEL: test_mask_adds_epu16_rmk_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpaddusw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdd,0x0f]
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <16 x i16>, <16 x i16>* %ptr_b
+  %1 = add <16 x i16> %a, %b
+  %2 = icmp ugt <16 x i16> %a, %1
+  %3 = select <16 x i1> %2, <16 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <16 x i16> %1
+  %4 = bitcast i16 %mask to <16 x i1>
+  %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> %passThru
+  ret <16 x i16> %5
+}
+
+define <16 x i16> @test_mask_adds_epu16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) {
+; CHECK-LABEL: test_mask_adds_epu16_rmkz_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpaddusw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdd,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <16 x i16>, <16 x i16>* %ptr_b
+  %1 = add <16 x i16> %a, %b
+  %2 = icmp ugt <16 x i16> %a, %1
+  %3 = select <16 x i1> %2, <16 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <16 x i16> %1
+  %4 = bitcast i16 %mask to <16 x i1>
+  %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> zeroinitializer
+  ret <16 x i16> %5
+}
+
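+; Unsigned saturating subtract of words: max(%a, %b) - %b built from icmp ugt + select.
+; Expected to match vpsubusw.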
+define <8 x i16> @test_mask_subs_epu16_rr_128(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test_mask_subs_epu16_rr_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd9,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %cmp = icmp ugt <8 x i16> %a, %b
+  %sel = select <8 x i1> %cmp, <8 x i16> %a, <8 x i16> %b
+  %sub = sub <8 x i16> %sel, %b
+  ret <8 x i16> %sub
+}
+
+define <8 x i16> @test_mask_subs_epu16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_subs_epu16_rrk_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpsubusw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd9,0xd1]
+; CHECK-NEXT:    vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %cmp = icmp ugt <8 x i16> %a, %b
+  %sel = select <8 x i1> %cmp, <8 x i16> %a, <8 x i16> %b
+  %sub = sub <8 x i16> %sel, %b
+  %bc = bitcast i8 %mask to <8 x i1>
+  %res = select <8 x i1> %bc, <8 x i16> %sub, <8 x i16> %passThru
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_mask_subs_epu16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_subs_epu16_rrkz_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd9,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %cmp = icmp ugt <8 x i16> %a, %b
+  %sel = select <8 x i1> %cmp, <8 x i16> %a, <8 x i16> %b
+  %sub = sub <8 x i16> %sel, %b
+  %bc = bitcast i8 %mask to <8 x i1>
+  %res = select <8 x i1> %bc, <8 x i16> %sub, <8 x i16> zeroinitializer
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_mask_subs_epu16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
+; CHECK-LABEL: test_mask_subs_epu16_rm_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpsubusw (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd9,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <8 x i16>, <8 x i16>* %ptr_b
+  %cmp = icmp ugt <8 x i16> %a, %b
+  %sel = select <8 x i1> %cmp, <8 x i16> %a, <8 x i16> %b
+  %sub = sub <8 x i16> %sel, %b
+  ret <8 x i16> %sub
+}
+
+define <8 x i16> @test_mask_subs_epu16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_subs_epu16_rmk_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpsubusw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd9,0x0f]
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <8 x i16>, <8 x i16>* %ptr_b
+  %cmp = icmp ugt <8 x i16> %a, %b
+  %sel = select <8 x i1> %cmp, <8 x i16> %a, <8 x i16> %b
+  %sub = sub <8 x i16> %sel, %b
+  %bc = bitcast i8 %mask to <8 x i1>
+  %res = select <8 x i1> %bc, <8 x i16> %sub, <8 x i16> %passThru
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_mask_subs_epu16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) {
+; CHECK-LABEL: test_mask_subs_epu16_rmkz_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpsubusw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd9,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <8 x i16>, <8 x i16>* %ptr_b
+  %cmp = icmp ugt <8 x i16> %a, %b
+  %sel = select <8 x i1> %cmp, <8 x i16> %a, <8 x i16> %b
+  %sub = sub <8 x i16> %sel, %b
+  %bc = bitcast i8 %mask to <8 x i1>
+  %res = select <8 x i1> %bc, <8 x i16> %sub, <8 x i16> zeroinitializer
+  ret <8 x i16> %res
+}
+
+define <16 x i16> @test_mask_subs_epu16_rr_256(<16 x i16> %a, <16 x i16> %b) {
+; CHECK-LABEL: test_mask_subs_epu16_rr_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd9,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %cmp = icmp ugt <16 x i16> %a, %b
+  %sel = select <16 x i1> %cmp, <16 x i16> %a, <16 x i16> %b
+  %sub = sub <16 x i16> %sel, %b
+  ret <16 x i16> %sub
+}
+
+define <16 x i16> @test_mask_subs_epu16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) {
+; CHECK-LABEL: test_mask_subs_epu16_rrk_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpsubusw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd9,0xd1]
+; CHECK-NEXT:    vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %cmp = icmp ugt <16 x i16> %a, %b
+  %sel = select <16 x i1> %cmp, <16 x i16> %a, <16 x i16> %b
+  %sub = sub <16 x i16> %sel, %b
+  %bc = bitcast i16 %mask to <16 x i1>
+  %res = select <16 x i1> %bc, <16 x i16> %sub, <16 x i16> %passThru
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_mask_subs_epu16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
+; CHECK-LABEL: test_mask_subs_epu16_rrkz_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd9,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %cmp = icmp ugt <16 x i16> %a, %b
+  %sel = select <16 x i1> %cmp, <16 x i16> %a, <16 x i16> %b
+  %sub = sub <16 x i16> %sel, %b
+  %bc = bitcast i16 %mask to <16 x i1>
+  %res = select <16 x i1> %bc, <16 x i16> %sub, <16 x i16> zeroinitializer
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_mask_subs_epu16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
+; CHECK-LABEL: test_mask_subs_epu16_rm_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpsubusw (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd9,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <16 x i16>, <16 x i16>* %ptr_b
+  %cmp = icmp ugt <16 x i16> %a, %b
+  %sel = select <16 x i1> %cmp, <16 x i16> %a, <16 x i16> %b
+  %sub = sub <16 x i16> %sel, %b
+  ret <16 x i16> %sub
+}
+
+define <16 x i16> @test_mask_subs_epu16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
+; CHECK-LABEL: test_mask_subs_epu16_rmk_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpsubusw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd9,0x0f]
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <16 x i16>, <16 x i16>* %ptr_b
+  %cmp = icmp ugt <16 x i16> %a, %b
+  %sel = select <16 x i1> %cmp, <16 x i16> %a, <16 x i16> %b
+  %sub = sub <16 x i16> %sel, %b
+  %bc = bitcast i16 %mask to <16 x i1>
+  %res = select <16 x i1> %bc, <16 x i16> %sub, <16 x i16> %passThru
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_mask_subs_epu16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) {
+; CHECK-LABEL: test_mask_subs_epu16_rmkz_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpsubusw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd9,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <16 x i16>, <16 x i16>* %ptr_b
+  %cmp = icmp ugt <16 x i16> %a, %b
+  %sel = select <16 x i1> %cmp, <16 x i16> %a, <16 x i16> %b
+  %sub = sub <16 x i16> %sel, %b
+  %bc = bitcast i16 %mask to <16 x i1>
+  %res = select <16 x i1> %bc, <16 x i16> %sub, <16 x i16> zeroinitializer
+  ret <16 x i16> %res
+}
+
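+; Signed saturating add of bytes: sext to i16, add, clamp to [-128, 127], trunc.
+; Expected to match vpaddsb.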
+define <16 x i8> @test_mask_adds_epi8_rr_128(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: test_mask_adds_epi8_rr_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpaddsb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xec,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %1 = sext <16 x i8> %a to <16 x i16>
+  %2 = sext <16 x i8> %b to <16 x i16>
+  %3 = add nsw <16 x i16> %1, %2
+  %4 = icmp slt <16 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %6 = icmp sgt <16 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %8 = trunc <16 x i16> %7 to <16 x i8>
+  ret <16 x i8> %8
+}
+
+define <16 x i8> @test_mask_adds_epi8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) {
+; CHECK-LABEL: test_mask_adds_epi8_rrk_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpaddsb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xec,0xd1]
+; CHECK-NEXT:    vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %1 = sext <16 x i8> %a to <16 x i16>
+  %2 = sext <16 x i8> %b to <16 x i16>
+  %3 = add nsw <16 x i16> %1, %2
+  %4 = icmp slt <16 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %6 = icmp sgt <16 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %8 = trunc <16 x i16> %7 to <16 x i8>
+  %9 = bitcast i16 %mask to <16 x i1>
+  %10 = select <16 x i1> %9, <16 x i8> %8, <16 x i8> %passThru
+  ret <16 x i8> %10
+}
+
+define <16 x i8> @test_mask_adds_epi8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
+; CHECK-LABEL: test_mask_adds_epi8_rrkz_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpaddsb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xec,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %1 = sext <16 x i8> %a to <16 x i16>
+  %2 = sext <16 x i8> %b to <16 x i16>
+  %3 = add nsw <16 x i16> %1, %2
+  %4 = icmp slt <16 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %6 = icmp sgt <16 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %8 = trunc <16 x i16> %7 to <16 x i8>
+  %9 = bitcast i16 %mask to <16 x i1>
+  %10 = select <16 x i1> %9, <16 x i8> %8, <16 x i8> zeroinitializer
+  ret <16 x i8> %10
+}
+
+define <16 x i8> @test_mask_adds_epi8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) {
+; CHECK-LABEL: test_mask_adds_epi8_rm_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpaddsb (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xec,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <16 x i8>, <16 x i8>* %ptr_b
+  %1 = sext <16 x i8> %a to <16 x i16>
+  %2 = sext <16 x i8> %b to <16 x i16>
+  %3 = add nsw <16 x i16> %1, %2
+  %4 = icmp slt <16 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %6 = icmp sgt <16 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %8 = trunc <16 x i16> %7 to <16 x i8>
+  ret <16 x i8> %8
+}
+
+define <16 x i8> @test_mask_adds_epi8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
+; CHECK-LABEL: test_mask_adds_epi8_rmk_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpaddsb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xec,0x0f]
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <16 x i8>, <16 x i8>* %ptr_b
+  %1 = sext <16 x i8> %a to <16 x i16>
+  %2 = sext <16 x i8> %b to <16 x i16>
+  %3 = add nsw <16 x i16> %1, %2
+  %4 = icmp slt <16 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %6 = icmp sgt <16 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %8 = trunc <16 x i16> %7 to <16 x i8>
+  %9 = bitcast i16 %mask to <16 x i1>
+  %10 = select <16 x i1> %9, <16 x i8> %8, <16 x i8> %passThru
+  ret <16 x i8> %10
+}
+
+define <16 x i8> @test_mask_adds_epi8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) {
+; CHECK-LABEL: test_mask_adds_epi8_rmkz_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpaddsb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xec,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <16 x i8>, <16 x i8>* %ptr_b
+  %1 = sext <16 x i8> %a to <16 x i16>
+  %2 = sext <16 x i8> %b to <16 x i16>
+  %3 = add nsw <16 x i16> %1, %2
+  %4 = icmp slt <16 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %6 = icmp sgt <16 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %8 = trunc <16 x i16> %7 to <16 x i8>
+  %9 = bitcast i16 %mask to <16 x i1>
+  %10 = select <16 x i1> %9, <16 x i8> %8, <16 x i8> zeroinitializer
+  ret <16 x i8> %10
+}
+
+define <32 x i8> @test_mask_adds_epi8_rr_256(<32 x i8> %a, <32 x i8> %b) {
+; CHECK-LABEL: test_mask_adds_epi8_rr_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpaddsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xec,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %1 = sext <32 x i8> %a to <32 x i16>
+  %2 = sext <32 x i8> %b to <32 x i16>
+  %3 = add nsw <32 x i16> %1, %2
+  %4 = icmp slt <32 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %6 = icmp sgt <32 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %8 = trunc <32 x i16> %7 to <32 x i8>
+  ret <32 x i8> %8
+}
+
+define <32 x i8> @test_mask_adds_epi8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) {
+; CHECK-LABEL: test_mask_adds_epi8_rrk_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpaddsb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xec,0xd1]
+; CHECK-NEXT:    vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %1 = sext <32 x i8> %a to <32 x i16>
+  %2 = sext <32 x i8> %b to <32 x i16>
+  %3 = add nsw <32 x i16> %1, %2
+  %4 = icmp slt <32 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %6 = icmp sgt <32 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %8 = trunc <32 x i16> %7 to <32 x i8>
+  %9 = bitcast i32 %mask to <32 x i1>
+  %10 = select <32 x i1> %9, <32 x i8> %8, <32 x i8> %passThru
+  ret <32 x i8> %10
+}
+
+define <32 x i8> @test_mask_adds_epi8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
+; CHECK-LABEL: test_mask_adds_epi8_rrkz_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpaddsb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xec,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %1 = sext <32 x i8> %a to <32 x i16>
+  %2 = sext <32 x i8> %b to <32 x i16>
+  %3 = add nsw <32 x i16> %1, %2
+  %4 = icmp slt <32 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %6 = icmp sgt <32 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %8 = trunc <32 x i16> %7 to <32 x i8>
+  %9 = bitcast i32 %mask to <32 x i1>
+  %10 = select <32 x i1> %9, <32 x i8> %8, <32 x i8> zeroinitializer
+  ret <32 x i8> %10
+}
+
+define <32 x i8> @test_mask_adds_epi8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) {
+; CHECK-LABEL: test_mask_adds_epi8_rm_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpaddsb (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xec,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <32 x i8>, <32 x i8>* %ptr_b
+  %1 = sext <32 x i8> %a to <32 x i16>
+  %2 = sext <32 x i8> %b to <32 x i16>
+  %3 = add nsw <32 x i16> %1, %2
+  %4 = icmp slt <32 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %6 = icmp sgt <32 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %8 = trunc <32 x i16> %7 to <32 x i8>
+  ret <32 x i8> %8
+}
+
+define <32 x i8> @test_mask_adds_epi8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
+; CHECK-LABEL: test_mask_adds_epi8_rmk_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpaddsb (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xec,0x0f]
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <32 x i8>, <32 x i8>* %ptr_b
+  %1 = sext <32 x i8> %a to <32 x i16>
+  %2 = sext <32 x i8> %b to <32 x i16>
+  %3 = add nsw <32 x i16> %1, %2
+  %4 = icmp slt <32 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %6 = icmp sgt <32 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %8 = trunc <32 x i16> %7 to <32 x i8>
+  %9 = bitcast i32 %mask to <32 x i1>
+  %10 = select <32 x i1> %9, <32 x i8> %8, <32 x i8> %passThru
+  ret <32 x i8> %10
+}
+
+define <32 x i8> @test_mask_adds_epi8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) {
+; CHECK-LABEL: test_mask_adds_epi8_rmkz_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpaddsb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xec,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <32 x i8>, <32 x i8>* %ptr_b
+  %1 = sext <32 x i8> %a to <32 x i16>
+  %2 = sext <32 x i8> %b to <32 x i16>
+  %3 = add nsw <32 x i16> %1, %2
+  %4 = icmp slt <32 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %6 = icmp sgt <32 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %8 = trunc <32 x i16> %7 to <32 x i8>
+  %9 = bitcast i32 %mask to <32 x i1>
+  %10 = select <32 x i1> %9, <32 x i8> %8, <32 x i8> zeroinitializer
+  ret <32 x i8> %10
+}
+
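+; Signed saturating subtract of bytes: the same clamp pattern with sub. Expected to
+; match vpsubsb.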
+define <16 x i8> @test_mask_subs_epi8_rr_128(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: test_mask_subs_epi8_rr_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpsubsb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe8,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %1 = sext <16 x i8> %a to <16 x i16>
+  %2 = sext <16 x i8> %b to <16 x i16>
+  %3 = sub nsw <16 x i16> %1, %2
+  %4 = icmp slt <16 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %6 = icmp sgt <16 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %8 = trunc <16 x i16> %7 to <16 x i8>
+  ret <16 x i8> %8
+}
+
+define <16 x i8> @test_mask_subs_epi8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) {
+; CHECK-LABEL: test_mask_subs_epi8_rrk_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpsubsb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe8,0xd1]
+; CHECK-NEXT:    vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %1 = sext <16 x i8> %a to <16 x i16>
+  %2 = sext <16 x i8> %b to <16 x i16>
+  %3 = sub nsw <16 x i16> %1, %2
+  %4 = icmp slt <16 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %6 = icmp sgt <16 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %8 = trunc <16 x i16> %7 to <16 x i8>
+  %9 = bitcast i16 %mask to <16 x i1>
+  %10 = select <16 x i1> %9, <16 x i8> %8, <16 x i8> %passThru
+  ret <16 x i8> %10
+}
+
+define <16 x i8> @test_mask_subs_epi8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
+; CHECK-LABEL: test_mask_subs_epi8_rrkz_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpsubsb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xe8,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %1 = sext <16 x i8> %a to <16 x i16>
+  %2 = sext <16 x i8> %b to <16 x i16>
+  %3 = sub nsw <16 x i16> %1, %2
+  %4 = icmp slt <16 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %6 = icmp sgt <16 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %8 = trunc <16 x i16> %7 to <16 x i8>
+  %9 = bitcast i16 %mask to <16 x i1>
+  %10 = select <16 x i1> %9, <16 x i8> %8, <16 x i8> zeroinitializer
+  ret <16 x i8> %10
+}
+
+define <16 x i8> @test_mask_subs_epi8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) {
+; CHECK-LABEL: test_mask_subs_epi8_rm_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpsubsb (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe8,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <16 x i8>, <16 x i8>* %ptr_b
+  %1 = sext <16 x i8> %a to <16 x i16>
+  %2 = sext <16 x i8> %b to <16 x i16>
+  %3 = sub nsw <16 x i16> %1, %2
+  %4 = icmp slt <16 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %6 = icmp sgt <16 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %8 = trunc <16 x i16> %7 to <16 x i8>
+  ret <16 x i8> %8
+}
+
+define <16 x i8> @test_mask_subs_epi8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
+; CHECK-LABEL: test_mask_subs_epi8_rmk_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpsubsb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe8,0x0f]
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <16 x i8>, <16 x i8>* %ptr_b
+  %1 = sext <16 x i8> %a to <16 x i16>
+  %2 = sext <16 x i8> %b to <16 x i16>
+  %3 = sub nsw <16 x i16> %1, %2
+  %4 = icmp slt <16 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %6 = icmp sgt <16 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %8 = trunc <16 x i16> %7 to <16 x i8>
+  %9 = bitcast i16 %mask to <16 x i1>
+  %10 = select <16 x i1> %9, <16 x i8> %8, <16 x i8> %passThru
+  ret <16 x i8> %10
+}
+
+define <16 x i8> @test_mask_subs_epi8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) {
+; CHECK-LABEL: test_mask_subs_epi8_rmkz_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpsubsb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xe8,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <16 x i8>, <16 x i8>* %ptr_b
+  %1 = sext <16 x i8> %a to <16 x i16>
+  %2 = sext <16 x i8> %b to <16 x i16>
+  %3 = sub nsw <16 x i16> %1, %2
+  %4 = icmp slt <16 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %6 = icmp sgt <16 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %8 = trunc <16 x i16> %7 to <16 x i8>
+  %9 = bitcast i16 %mask to <16 x i1>
+  %10 = select <16 x i1> %9, <16 x i8> %8, <16 x i8> zeroinitializer
+  ret <16 x i8> %10
+}
+
+define <32 x i8> @test_mask_subs_epi8_rr_256(<32 x i8> %a, <32 x i8> %b) {
+; CHECK-LABEL: test_mask_subs_epi8_rr_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpsubsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe8,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %1 = sext <32 x i8> %a to <32 x i16>
+  %2 = sext <32 x i8> %b to <32 x i16>
+  %3 = sub nsw <32 x i16> %1, %2
+  %4 = icmp slt <32 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %6 = icmp sgt <32 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %8 = trunc <32 x i16> %7 to <32 x i8>
+  ret <32 x i8> %8
+}
+
+define <32 x i8> @test_mask_subs_epi8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) {
+; CHECK-LABEL: test_mask_subs_epi8_rrk_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpsubsb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe8,0xd1]
+; CHECK-NEXT:    vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %1 = sext <32 x i8> %a to <32 x i16>
+  %2 = sext <32 x i8> %b to <32 x i16>
+  %3 = sub nsw <32 x i16> %1, %2
+  %4 = icmp slt <32 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %6 = icmp sgt <32 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %8 = trunc <32 x i16> %7 to <32 x i8>
+  %9 = bitcast i32 %mask to <32 x i1>
+  %10 = select <32 x i1> %9, <32 x i8> %8, <32 x i8> %passThru
+  ret <32 x i8> %10
+}
+
+define <32 x i8> @test_mask_subs_epi8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
+; CHECK-LABEL: test_mask_subs_epi8_rrkz_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpsubsb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xe8,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %1 = sext <32 x i8> %a to <32 x i16>
+  %2 = sext <32 x i8> %b to <32 x i16>
+  %3 = sub nsw <32 x i16> %1, %2
+  %4 = icmp slt <32 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %6 = icmp sgt <32 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %8 = trunc <32 x i16> %7 to <32 x i8>
+  %9 = bitcast i32 %mask to <32 x i1>
+  %10 = select <32 x i1> %9, <32 x i8> %8, <32 x i8> zeroinitializer
+  ret <32 x i8> %10
+}
+
+define <32 x i8> @test_mask_subs_epi8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) {
+; CHECK-LABEL: test_mask_subs_epi8_rm_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpsubsb (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe8,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <32 x i8>, <32 x i8>* %ptr_b
+  %1 = sext <32 x i8> %a to <32 x i16>
+  %2 = sext <32 x i8> %b to <32 x i16>
+  %3 = sub nsw <32 x i16> %1, %2
+  %4 = icmp slt <32 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %6 = icmp sgt <32 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %8 = trunc <32 x i16> %7 to <32 x i8>
+  ret <32 x i8> %8
+}
+
+define <32 x i8> @test_mask_subs_epi8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
+; CHECK-LABEL: test_mask_subs_epi8_rmk_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpsubsb (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe8,0x0f]
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <32 x i8>, <32 x i8>* %ptr_b
+  %1 = sext <32 x i8> %a to <32 x i16>
+  %2 = sext <32 x i8> %b to <32 x i16>
+  %3 = sub nsw <32 x i16> %1, %2
+  %4 = icmp slt <32 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %6 = icmp sgt <32 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %8 = trunc <32 x i16> %7 to <32 x i8>
+  %9 = bitcast i32 %mask to <32 x i1>
+  %10 = select <32 x i1> %9, <32 x i8> %8, <32 x i8> %passThru
+  ret <32 x i8> %10
+}
+
+define <32 x i8> @test_mask_subs_epi8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) {
+; CHECK-LABEL: test_mask_subs_epi8_rmkz_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpsubsb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xe8,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <32 x i8>, <32 x i8>* %ptr_b
+  %1 = sext <32 x i8> %a to <32 x i16>
+  %2 = sext <32 x i8> %b to <32 x i16>
+  %3 = sub nsw <32 x i16> %1, %2
+  %4 = icmp slt <32 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %6 = icmp sgt <32 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %8 = trunc <32 x i16> %7 to <32 x i8>
+  %9 = bitcast i32 %mask to <32 x i1>
+  %10 = select <32 x i1> %9, <32 x i8> %8, <32 x i8> zeroinitializer
+  ret <32 x i8> %10
+}
+
+define <16 x i8> @test_mask_adds_epu8_rr_128(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: test_mask_adds_epu8_rr_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpaddusb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdc,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %1 = add <16 x i8> %a, %b
+  %2 = icmp ugt <16 x i8> %a, %1
+  %3 = select <16 x i1> %2, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <16 x i8> %1
+  ret <16 x i8> %3
+}
+
+define <16 x i8> @test_mask_adds_epu8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) {
+; CHECK-LABEL: test_mask_adds_epu8_rrk_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpaddusb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdc,0xd1]
+; CHECK-NEXT:    vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %1 = add <16 x i8> %a, %b
+  %2 = icmp ugt <16 x i8> %a, %1
+  %3 = select <16 x i1> %2, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <16 x i8> %1
+  %4 = bitcast i16 %mask to <16 x i1>
+  %5 = select <16 x i1> %4, <16 x i8> %3, <16 x i8> %passThru
+  ret <16 x i8> %5
+}
+
+define <16 x i8> @test_mask_adds_epu8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
+; CHECK-LABEL: test_mask_adds_epu8_rrkz_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpaddusb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdc,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %1 = add <16 x i8> %a, %b
+  %2 = icmp ugt <16 x i8> %a, %1
+  %3 = select <16 x i1> %2, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <16 x i8> %1
+  %4 = bitcast i16 %mask to <16 x i1>
+  %5 = select <16 x i1> %4, <16 x i8> %3, <16 x i8> zeroinitializer
+  ret <16 x i8> %5
+}
+
+define <16 x i8> @test_mask_adds_epu8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) {
+; CHECK-LABEL: test_mask_adds_epu8_rm_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpaddusb (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdc,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <16 x i8>, <16 x i8>* %ptr_b
+  %1 = add <16 x i8> %a, %b
+  %2 = icmp ugt <16 x i8> %a, %1
+  %3 = select <16 x i1> %2, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <16 x i8> %1
+  ret <16 x i8> %3
+}
+
+define <16 x i8> @test_mask_adds_epu8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
+; CHECK-LABEL: test_mask_adds_epu8_rmk_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpaddusb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdc,0x0f]
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <16 x i8>, <16 x i8>* %ptr_b
+  %1 = add <16 x i8> %a, %b
+  %2 = icmp ugt <16 x i8> %a, %1
+  %3 = select <16 x i1> %2, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <16 x i8> %1
+  %4 = bitcast i16 %mask to <16 x i1>
+  %5 = select <16 x i1> %4, <16 x i8> %3, <16 x i8> %passThru
+  ret <16 x i8> %5
+}
+
+define <16 x i8> @test_mask_adds_epu8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) {
+; CHECK-LABEL: test_mask_adds_epu8_rmkz_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpaddusb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdc,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <16 x i8>, <16 x i8>* %ptr_b
+  %1 = add <16 x i8> %a, %b
+  %2 = icmp ugt <16 x i8> %a, %1
+  %3 = select <16 x i1> %2, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <16 x i8> %1
+  %4 = bitcast i16 %mask to <16 x i1>
+  %5 = select <16 x i1> %4, <16 x i8> %3, <16 x i8> zeroinitializer
+  ret <16 x i8> %5
+}
+
+define <32 x i8> @test_mask_adds_epu8_rr_256(<32 x i8> %a, <32 x i8> %b) {
+; CHECK-LABEL: test_mask_adds_epu8_rr_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpaddusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdc,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %1 = add <32 x i8> %a, %b
+  %2 = icmp ugt <32 x i8> %a, %1
+  %3 = select <32 x i1> %2, <32 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <32 x i8> %1
+  ret <32 x i8> %3
+}
+
+define <32 x i8> @test_mask_adds_epu8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) {
+; CHECK-LABEL: test_mask_adds_epu8_rrk_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpaddusb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdc,0xd1]
+; CHECK-NEXT:    vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %1 = add <32 x i8> %a, %b
+  %2 = icmp ugt <32 x i8> %a, %1
+  %3 = select <32 x i1> %2, <32 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <32 x i8> %1
+  %4 = bitcast i32 %mask to <32 x i1>
+  %5 = select <32 x i1> %4, <32 x i8> %3, <32 x i8> %passThru
+  ret <32 x i8> %5
+}
+
+define <32 x i8> @test_mask_adds_epu8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
+; CHECK-LABEL: test_mask_adds_epu8_rrkz_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpaddusb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdc,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %1 = add <32 x i8> %a, %b
+  %2 = icmp ugt <32 x i8> %a, %1
+  %3 = select <32 x i1> %2, <32 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <32 x i8> %1
+  %4 = bitcast i32 %mask to <32 x i1>
+  %5 = select <32 x i1> %4, <32 x i8> %3, <32 x i8> zeroinitializer
+  ret <32 x i8> %5
+}
+
+define <32 x i8> @test_mask_adds_epu8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) {
+; CHECK-LABEL: test_mask_adds_epu8_rm_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpaddusb (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdc,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <32 x i8>, <32 x i8>* %ptr_b
+  %1 = add <32 x i8> %a, %b
+  %2 = icmp ugt <32 x i8> %a, %1
+  %3 = select <32 x i1> %2, <32 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <32 x i8> %1
+  ret <32 x i8> %3
+}
+
+define <32 x i8> @test_mask_adds_epu8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
+; CHECK-LABEL: test_mask_adds_epu8_rmk_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpaddusb (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdc,0x0f]
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <32 x i8>, <32 x i8>* %ptr_b
+  %1 = add <32 x i8> %a, %b
+  %2 = icmp ugt <32 x i8> %a, %1
+  %3 = select <32 x i1> %2, <32 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <32 x i8> %1
+  %4 = bitcast i32 %mask to <32 x i1>
+  %5 = select <32 x i1> %4, <32 x i8> %3, <32 x i8> %passThru
+  ret <32 x i8> %5
+}
+
+define <32 x i8> @test_mask_adds_epu8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) {
+; CHECK-LABEL: test_mask_adds_epu8_rmkz_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpaddusb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdc,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <32 x i8>, <32 x i8>* %ptr_b
+  %1 = add <32 x i8> %a, %b
+  %2 = icmp ugt <32 x i8> %a, %1
+  %3 = select <32 x i1> %2, <32 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <32 x i8> %1
+  %4 = bitcast i32 %mask to <32 x i1>
+  %5 = select <32 x i1> %4, <32 x i8> %3, <32 x i8> zeroinitializer
+  ret <32 x i8> %5
+}
+
+define <16 x i8> @test_mask_subs_epu8_rr_128(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: test_mask_subs_epu8_rr_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpsubusb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd8,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %cmp = icmp ugt <16 x i8> %a, %b
+  %sel = select <16 x i1> %cmp, <16 x i8> %a, <16 x i8> %b
+  %sub = sub <16 x i8> %sel, %b
+  ret <16 x i8> %sub
+}
+
+define <16 x i8> @test_mask_subs_epu8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) {
+; CHECK-LABEL: test_mask_subs_epu8_rrk_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpsubusb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd8,0xd1]
+; CHECK-NEXT:    vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %cmp = icmp ugt <16 x i8> %a, %b
+  %sel = select <16 x i1> %cmp, <16 x i8> %a, <16 x i8> %b
+  %sub = sub <16 x i8> %sel, %b
+  %bc = bitcast i16 %mask to <16 x i1>
+  %res = select <16 x i1> %bc, <16 x i8> %sub, <16 x i8> %passThru
+  ret <16 x i8> %res
+}
+
+define <16 x i8> @test_mask_subs_epu8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
+; CHECK-LABEL: test_mask_subs_epu8_rrkz_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpsubusb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd8,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %cmp = icmp ugt <16 x i8> %a, %b
+  %sel = select <16 x i1> %cmp, <16 x i8> %a, <16 x i8> %b
+  %sub = sub <16 x i8> %sel, %b
+  %bc = bitcast i16 %mask to <16 x i1>
+  %res = select <16 x i1> %bc, <16 x i8> %sub, <16 x i8> zeroinitializer
+  ret <16 x i8> %res
+}
+
+define <16 x i8> @test_mask_subs_epu8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) {
+; CHECK-LABEL: test_mask_subs_epu8_rm_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpsubusb (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd8,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <16 x i8>, <16 x i8>* %ptr_b
+  %cmp = icmp ugt <16 x i8> %a, %b
+  %sel = select <16 x i1> %cmp, <16 x i8> %a, <16 x i8> %b
+  %sub = sub <16 x i8> %sel, %b
+  ret <16 x i8> %sub
+}
+
+define <16 x i8> @test_mask_subs_epu8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
+; CHECK-LABEL: test_mask_subs_epu8_rmk_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpsubusb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd8,0x0f]
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <16 x i8>, <16 x i8>* %ptr_b
+  %cmp = icmp ugt <16 x i8> %a, %b
+  %sel = select <16 x i1> %cmp, <16 x i8> %a, <16 x i8> %b
+  %sub = sub <16 x i8> %sel, %b
+  %bc = bitcast i16 %mask to <16 x i1>
+  %res = select <16 x i1> %bc, <16 x i8> %sub, <16 x i8> %passThru
+  ret <16 x i8> %res
+}
+
+define <16 x i8> @test_mask_subs_epu8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) {
+; CHECK-LABEL: test_mask_subs_epu8_rmkz_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpsubusb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd8,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <16 x i8>, <16 x i8>* %ptr_b
+  %cmp = icmp ugt <16 x i8> %a, %b
+  %sel = select <16 x i1> %cmp, <16 x i8> %a, <16 x i8> %b
+  %sub = sub <16 x i8> %sel, %b
+  %bc = bitcast i16 %mask to <16 x i1>
+  %res = select <16 x i1> %bc, <16 x i8> %sub, <16 x i8> zeroinitializer
+  ret <16 x i8> %res
+}
+
+define <32 x i8> @test_mask_subs_epu8_rr_256(<32 x i8> %a, <32 x i8> %b) {
+; CHECK-LABEL: test_mask_subs_epu8_rr_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd8,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %cmp = icmp ugt <32 x i8> %a, %b
+  %sel = select <32 x i1> %cmp, <32 x i8> %a, <32 x i8> %b
+  %sub = sub <32 x i8> %sel, %b
+  ret <32 x i8> %sub
+}
+
+define <32 x i8> @test_mask_subs_epu8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) {
+; CHECK-LABEL: test_mask_subs_epu8_rrk_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpsubusb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd8,0xd1]
+; CHECK-NEXT:    vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %cmp = icmp ugt <32 x i8> %a, %b
+  %sel = select <32 x i1> %cmp, <32 x i8> %a, <32 x i8> %b
+  %sub = sub <32 x i8> %sel, %b
+  %bc = bitcast i32 %mask to <32 x i1>
+  %res = select <32 x i1> %bc, <32 x i8> %sub, <32 x i8> %passThru
+  ret <32 x i8> %res
+}
+
+define <32 x i8> @test_mask_subs_epu8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
+; CHECK-LABEL: test_mask_subs_epu8_rrkz_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd8,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %cmp = icmp ugt <32 x i8> %a, %b
+  %sel = select <32 x i1> %cmp, <32 x i8> %a, <32 x i8> %b
+  %sub = sub <32 x i8> %sel, %b
+  %bc = bitcast i32 %mask to <32 x i1>
+  %res = select <32 x i1> %bc, <32 x i8> %sub, <32 x i8> zeroinitializer
+  ret <32 x i8> %res
+}
+
+define <32 x i8> @test_mask_subs_epu8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) {
+; CHECK-LABEL: test_mask_subs_epu8_rm_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpsubusb (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd8,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <32 x i8>, <32 x i8>* %ptr_b
+  %cmp = icmp ugt <32 x i8> %a, %b
+  %sel = select <32 x i1> %cmp, <32 x i8> %a, <32 x i8> %b
+  %sub = sub <32 x i8> %sel, %b
+  ret <32 x i8> %sub
+}
+
+define <32 x i8> @test_mask_subs_epu8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
+; CHECK-LABEL: test_mask_subs_epu8_rmk_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpsubusb (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd8,0x0f]
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <32 x i8>, <32 x i8>* %ptr_b
+  %cmp = icmp ugt <32 x i8> %a, %b
+  %sel = select <32 x i1> %cmp, <32 x i8> %a, <32 x i8> %b
+  %sub = sub <32 x i8> %sel, %b
+  %bc = bitcast i32 %mask to <32 x i1>
+  %res = select <32 x i1> %bc, <32 x i8> %sub, <32 x i8> %passThru
+  ret <32 x i8> %res
+}
+
+define <32 x i8> @test_mask_subs_epu8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) {
+; CHECK-LABEL: test_mask_subs_epu8_rmkz_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpsubusb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd8,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <32 x i8>, <32 x i8>* %ptr_b
+  %cmp = icmp ugt <32 x i8> %a, %b
+  %sel = select <32 x i1> %cmp, <32 x i8> %a, <32 x i8> %b
+  %sub = sub <32 x i8> %sel, %b
+  %bc = bitcast i32 %mask to <32 x i1>
+  %res = select <32 x i1> %bc, <32 x i8> %sub, <32 x i8> zeroinitializer
+  ret <32 x i8> %res
+}
Index: test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
===================================================================
--- test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
+++ test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
@@ -4152,3 +4152,523 @@
   %res2 = add <16 x i16> %res, %res1
   ret <16 x i16> %res2
 }
+
+define <8 x i16> @test_mask_adds_epu16_rr_128(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test_mask_adds_epu16_rr_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpaddusw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdd,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_mask_adds_epu16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_adds_epu16_rrk_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpaddusw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdd,0xd1]
+; CHECK-NEXT:    vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_mask_adds_epu16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_adds_epu16_rrkz_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpaddusw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdd,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_mask_adds_epu16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
+; CHECK-LABEL: test_mask_adds_epu16_rm_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpaddusw (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdd,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <8 x i16>, <8 x i16>* %ptr_b
+  %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_mask_adds_epu16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_adds_epu16_rmk_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpaddusw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdd,0x0f]
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <8 x i16>, <8 x i16>* %ptr_b
+  %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_mask_adds_epu16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) {
+; CHECK-LABEL: test_mask_adds_epu16_rmkz_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpaddusw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdd,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <8 x i16>, <8 x i16>* %ptr_b
+  %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
+  ret <8 x i16> %res
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
+
+define <16 x i16> @test_mask_adds_epu16_rr_256(<16 x i16> %a, <16 x i16> %b) {
+; CHECK-LABEL: test_mask_adds_epu16_rr_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpaddusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdd,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_mask_adds_epu16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) {
+; CHECK-LABEL: test_mask_adds_epu16_rrk_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpaddusw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdd,0xd1]
+; CHECK-NEXT:    vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_mask_adds_epu16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
+; CHECK-LABEL: test_mask_adds_epu16_rrkz_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpaddusw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdd,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_mask_adds_epu16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
+; CHECK-LABEL: test_mask_adds_epu16_rm_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpaddusw (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdd,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <16 x i16>, <16 x i16>* %ptr_b
+  %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_mask_adds_epu16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
+; CHECK-LABEL: test_mask_adds_epu16_rmk_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpaddusw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdd,0x0f]
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <16 x i16>, <16 x i16>* %ptr_b
+  %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_mask_adds_epu16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) {
+; CHECK-LABEL: test_mask_adds_epu16_rmkz_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpaddusw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdd,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <16 x i16>, <16 x i16>* %ptr_b
+  %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
+  ret <16 x i16> %res
+}
+
+declare <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
+
+define <8 x i16> @test_mask_subs_epu16_rr_128(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test_mask_subs_epu16_rr_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd9,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_mask_subs_epu16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_subs_epu16_rrk_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpsubusw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd9,0xd1]
+; CHECK-NEXT:    vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_mask_subs_epu16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_subs_epu16_rrkz_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd9,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_mask_subs_epu16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
+; CHECK-LABEL: test_mask_subs_epu16_rm_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpsubusw (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd9,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <8 x i16>, <8 x i16>* %ptr_b
+  %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_mask_subs_epu16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_subs_epu16_rmk_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpsubusw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd9,0x0f]
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <8 x i16>, <8 x i16>* %ptr_b
+  %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_mask_subs_epu16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) {
+; CHECK-LABEL: test_mask_subs_epu16_rmkz_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpsubusw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd9,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <8 x i16>, <8 x i16>* %ptr_b
+  %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
+  ret <8 x i16> %res
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
+
+define <16 x i16> @test_mask_subs_epu16_rr_256(<16 x i16> %a, <16 x i16> %b) {
+; CHECK-LABEL: test_mask_subs_epu16_rr_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd9,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_mask_subs_epu16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) {
+; CHECK-LABEL: test_mask_subs_epu16_rrk_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpsubusw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd9,0xd1]
+; CHECK-NEXT:    vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_mask_subs_epu16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
+; CHECK-LABEL: test_mask_subs_epu16_rrkz_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd9,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_mask_subs_epu16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
+; CHECK-LABEL: test_mask_subs_epu16_rm_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpsubusw (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd9,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <16 x i16>, <16 x i16>* %ptr_b
+  %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_mask_subs_epu16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
+; CHECK-LABEL: test_mask_subs_epu16_rmk_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpsubusw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd9,0x0f]
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <16 x i16>, <16 x i16>* %ptr_b
+  %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_mask_subs_epu16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) {
+; CHECK-LABEL: test_mask_subs_epu16_rmkz_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpsubusw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd9,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <16 x i16>, <16 x i16>* %ptr_b
+  %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
+  ret <16 x i16> %res
+}
+
+declare <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
+
+define <16 x i8> @test_mask_adds_epu8_rr_128(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: test_mask_adds_epu8_rr_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpaddusb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdc,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
+  ret <16 x i8> %res
+}
+
+define <16 x i8> @test_mask_adds_epu8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) {
+; CHECK-LABEL: test_mask_adds_epu8_rrk_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpaddusb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdc,0xd1]
+; CHECK-NEXT:    vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask)
+  ret <16 x i8> %res
+}
+
+define <16 x i8> @test_mask_adds_epu8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
+; CHECK-LABEL: test_mask_adds_epu8_rrkz_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpaddusb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdc,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
+  ret <16 x i8> %res
+}
+
+define <16 x i8> @test_mask_adds_epu8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) {
+; CHECK-LABEL: test_mask_adds_epu8_rm_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpaddusb (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdc,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <16 x i8>, <16 x i8>* %ptr_b
+  %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
+  ret <16 x i8> %res
+}
+
+define <16 x i8> @test_mask_adds_epu8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
+; CHECK-LABEL: test_mask_adds_epu8_rmk_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpaddusb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdc,0x0f]
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <16 x i8>, <16 x i8>* %ptr_b
+  %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask)
+  ret <16 x i8> %res
+}
+
+define <16 x i8> @test_mask_adds_epu8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) {
+; CHECK-LABEL: test_mask_adds_epu8_rmkz_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpaddusb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdc,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <16 x i8>, <16 x i8>* %ptr_b
+  %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
+  ret <16 x i8> %res
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
+
+define <32 x i8> @test_mask_adds_epu8_rr_256(<32 x i8> %a, <32 x i8> %b) {
+; CHECK-LABEL: test_mask_adds_epu8_rr_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpaddusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdc,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
+  ret <32 x i8> %res
+}
+
+define <32 x i8> @test_mask_adds_epu8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) {
+; CHECK-LABEL: test_mask_adds_epu8_rrk_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpaddusb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdc,0xd1]
+; CHECK-NEXT:    vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask)
+  ret <32 x i8> %res
+}
+
+define <32 x i8> @test_mask_adds_epu8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
+; CHECK-LABEL: test_mask_adds_epu8_rrkz_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpaddusb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdc,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask)
+  ret <32 x i8> %res
+}
+
+define <32 x i8> @test_mask_adds_epu8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) {
+; CHECK-LABEL: test_mask_adds_epu8_rm_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpaddusb (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdc,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <32 x i8>, <32 x i8>* %ptr_b
+  %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
+  ret <32 x i8> %res
+}
+
+define <32 x i8> @test_mask_adds_epu8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
+; CHECK-LABEL: test_mask_adds_epu8_rmk_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpaddusb (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdc,0x0f]
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <32 x i8>, <32 x i8>* %ptr_b
+  %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask)
+  ret <32 x i8> %res
+}
+
+define <32 x i8> @test_mask_adds_epu8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) {
+; CHECK-LABEL: test_mask_adds_epu8_rmkz_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpaddusb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdc,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <32 x i8>, <32 x i8>* %ptr_b
+  %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask)
+  ret <32 x i8> %res
+}
+
+declare <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
+
+define <16 x i8> @test_mask_subs_epu8_rr_128(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: test_mask_subs_epu8_rr_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpsubusb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd8,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
+  ret <16 x i8> %res
+}
+
+define <16 x i8> @test_mask_subs_epu8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) {
+; CHECK-LABEL: test_mask_subs_epu8_rrk_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpsubusb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd8,0xd1]
+; CHECK-NEXT:    vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask)
+  ret <16 x i8> %res
+}
+
+define <16 x i8> @test_mask_subs_epu8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
+; CHECK-LABEL: test_mask_subs_epu8_rrkz_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpsubusb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd8,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
+  ret <16 x i8> %res
+}
+
+define <16 x i8> @test_mask_subs_epu8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) {
+; CHECK-LABEL: test_mask_subs_epu8_rm_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpsubusb (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd8,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <16 x i8>, <16 x i8>* %ptr_b
+  %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
+  ret <16 x i8> %res
+}
+
+define <16 x i8> @test_mask_subs_epu8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
+; CHECK-LABEL: test_mask_subs_epu8_rmk_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpsubusb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd8,0x0f]
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <16 x i8>, <16 x i8>* %ptr_b
+  %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask)
+  ret <16 x i8> %res
+}
+
+define <16 x i8> @test_mask_subs_epu8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) {
+; CHECK-LABEL: test_mask_subs_epu8_rmkz_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpsubusb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd8,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <16 x i8>, <16 x i8>* %ptr_b
+  %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
+  ret <16 x i8> %res
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
+
+define <32 x i8> @test_mask_subs_epu8_rr_256(<32 x i8> %a, <32 x i8> %b) {
+; CHECK-LABEL: test_mask_subs_epu8_rr_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd8,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
+  ret <32 x i8> %res
+}
+
+define <32 x i8> @test_mask_subs_epu8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) {
+; CHECK-LABEL: test_mask_subs_epu8_rrk_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpsubusb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd8,0xd1]
+; CHECK-NEXT:    vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask)
+  ret <32 x i8> %res
+}
+
+define <32 x i8> @test_mask_subs_epu8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
+; CHECK-LABEL: test_mask_subs_epu8_rrkz_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd8,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask)
+  ret <32 x i8> %res
+}
+
+define <32 x i8> @test_mask_subs_epu8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) {
+; CHECK-LABEL: test_mask_subs_epu8_rm_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpsubusb (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd8,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <32 x i8>, <32 x i8>* %ptr_b
+  %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
+  ret <32 x i8> %res
+}
+
+define <32 x i8> @test_mask_subs_epu8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
+; CHECK-LABEL: test_mask_subs_epu8_rmk_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpsubusb (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd8,0x0f]
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <32 x i8>, <32 x i8>* %ptr_b
+  %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask)
+  ret <32 x i8> %res
+}
+
+define <32 x i8> @test_mask_subs_epu8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) {
+; CHECK-LABEL: test_mask_subs_epu8_rmkz_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpsubusb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd8,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <32 x i8>, <32 x i8>* %ptr_b
+  %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask)
+  ret <32 x i8> %res
+}
+
+declare <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
Index: test/CodeGen/X86/avx512bwvl-intrinsics.ll
===================================================================
--- test/CodeGen/X86/avx512bwvl-intrinsics.ll
+++ test/CodeGen/X86/avx512bwvl-intrinsics.ll
@@ -1018,266 +1018,6 @@
 
 declare <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
 
-define <8 x i16> @test_mask_adds_epu16_rr_128(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-LABEL: test_mask_adds_epu16_rr_128:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpaddusw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdd,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
-  ret <8 x i16> %res
-}
-
-define <8 x i16> @test_mask_adds_epu16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) {
-; CHECK-LABEL: test_mask_adds_epu16_rrk_128:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT:    vpaddusw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdd,0xd1]
-; CHECK-NEXT:    vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
-  ret <8 x i16> %res
-}
-
-define <8 x i16> @test_mask_adds_epu16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
-; CHECK-LABEL: test_mask_adds_epu16_rrkz_128:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT:    vpaddusw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdd,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
-  ret <8 x i16> %res
-}
-
-define <8 x i16> @test_mask_adds_epu16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
-; CHECK-LABEL: test_mask_adds_epu16_rm_128:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpaddusw (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdd,0x07]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %b = load <8 x i16>, <8 x i16>* %ptr_b
-  %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
-  ret <8 x i16> %res
-}
-
-define <8 x i16> @test_mask_adds_epu16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
-; CHECK-LABEL: test_mask_adds_epu16_rmk_128:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT:    vpaddusw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdd,0x0f]
-; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %b = load <8 x i16>, <8 x i16>* %ptr_b
-  %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
-  ret <8 x i16> %res
-}
-
-define <8 x i16> @test_mask_adds_epu16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) {
-; CHECK-LABEL: test_mask_adds_epu16_rmkz_128:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT:    vpaddusw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdd,0x07]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %b = load <8 x i16>, <8 x i16>* %ptr_b
-  %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
-  ret <8 x i16> %res
-}
-
-declare <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
-
-define <16 x i16> @test_mask_adds_epu16_rr_256(<16 x i16> %a, <16 x i16> %b) {
-; CHECK-LABEL: test_mask_adds_epu16_rr_256:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpaddusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdd,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
-  ret <16 x i16> %res
-}
-
-define <16 x i16> @test_mask_adds_epu16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) {
-; CHECK-LABEL: test_mask_adds_epu16_rrk_256:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT:    vpaddusw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdd,0xd1]
-; CHECK-NEXT:    vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
-  ret <16 x i16> %res
-}
-
-define <16 x i16> @test_mask_adds_epu16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
-; CHECK-LABEL: test_mask_adds_epu16_rrkz_256:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT:    vpaddusw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdd,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
-  ret <16 x i16> %res
-}
-
-define <16 x i16> @test_mask_adds_epu16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
-; CHECK-LABEL: test_mask_adds_epu16_rm_256:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpaddusw (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdd,0x07]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %b = load <16 x i16>, <16 x i16>* %ptr_b
-  %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
-  ret <16 x i16> %res
-}
-
-define <16 x i16> @test_mask_adds_epu16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
-; CHECK-LABEL: test_mask_adds_epu16_rmk_256:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT:    vpaddusw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdd,0x0f]
-; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %b = load <16 x i16>, <16 x i16>* %ptr_b
-  %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
-  ret <16 x i16> %res
-}
-
-define <16 x i16> @test_mask_adds_epu16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) {
-; CHECK-LABEL: test_mask_adds_epu16_rmkz_256:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT:    vpaddusw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdd,0x07]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %b = load <16 x i16>, <16 x i16>* %ptr_b
-  %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
-  ret <16 x i16> %res
-}
-
-declare <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
-
-define <8 x i16> @test_mask_subs_epu16_rr_128(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-LABEL: test_mask_subs_epu16_rr_128:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd9,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
-  ret <8 x i16> %res
-}
-
-define <8 x i16> @test_mask_subs_epu16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) {
-; CHECK-LABEL: test_mask_subs_epu16_rrk_128:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT:    vpsubusw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd9,0xd1]
-; CHECK-NEXT:    vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
-  ret <8 x i16> %res
-}
-
-define <8 x i16> @test_mask_subs_epu16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
-; CHECK-LABEL: test_mask_subs_epu16_rrkz_128:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd9,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
-  ret <8 x i16> %res
-}
-
-define <8 x i16> @test_mask_subs_epu16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
-; CHECK-LABEL: test_mask_subs_epu16_rm_128:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpsubusw (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd9,0x07]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %b = load <8 x i16>, <8 x i16>* %ptr_b
-  %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
-  ret <8 x i16> %res
-}
-
-define <8 x i16> @test_mask_subs_epu16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
-; CHECK-LABEL: test_mask_subs_epu16_rmk_128:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT:    vpsubusw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd9,0x0f]
-; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %b = load <8 x i16>, <8 x i16>* %ptr_b
-  %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
-  ret <8 x i16> %res
-}
-
-define <8 x i16> @test_mask_subs_epu16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) {
-; CHECK-LABEL: test_mask_subs_epu16_rmkz_128:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT:    vpsubusw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd9,0x07]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %b = load <8 x i16>, <8 x i16>* %ptr_b
-  %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
-  ret <8 x i16> %res
-}
-
-declare <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
-
-define <16 x i16> @test_mask_subs_epu16_rr_256(<16 x i16> %a, <16 x i16> %b) {
-; CHECK-LABEL: test_mask_subs_epu16_rr_256:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd9,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
-  ret <16 x i16> %res
-}
-
-define <16 x i16> @test_mask_subs_epu16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) {
-; CHECK-LABEL: test_mask_subs_epu16_rrk_256:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT:    vpsubusw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd9,0xd1]
-; CHECK-NEXT:    vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
-  ret <16 x i16> %res
-}
-
-define <16 x i16> @test_mask_subs_epu16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
-; CHECK-LABEL: test_mask_subs_epu16_rrkz_256:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd9,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
-  ret <16 x i16> %res
-}
-
-define <16 x i16> @test_mask_subs_epu16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
-; CHECK-LABEL: test_mask_subs_epu16_rm_256:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpsubusw (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd9,0x07]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %b = load <16 x i16>, <16 x i16>* %ptr_b
-  %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
-  ret <16 x i16> %res
-}
-
-define <16 x i16> @test_mask_subs_epu16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
-; CHECK-LABEL: test_mask_subs_epu16_rmk_256:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT:    vpsubusw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd9,0x0f]
-; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %b = load <16 x i16>, <16 x i16>* %ptr_b
-  %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
-  ret <16 x i16> %res
-}
-
-define <16 x i16> @test_mask_subs_epu16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) {
-; CHECK-LABEL: test_mask_subs_epu16_rmkz_256:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT:    vpsubusw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd9,0x07]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %b = load <16 x i16>, <16 x i16>* %ptr_b
-  %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
-  ret <16 x i16> %res
-}
-
-declare <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
-
 define <16 x i8> @test_mask_adds_epi8_rr_128(<16 x i8> %a, <16 x i8> %b) {
 ; CHECK-LABEL: test_mask_adds_epi8_rr_128:
 ; CHECK:       ## %bb.0:
@@ -1538,265 +1278,7 @@
 
 declare <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
 
-define <16 x i8> @test_mask_adds_epu8_rr_128(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-LABEL: test_mask_adds_epu8_rr_128:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpaddusb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdc,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
-  ret <16 x i8> %res
-}
-
-define <16 x i8> @test_mask_adds_epu8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) {
-; CHECK-LABEL: test_mask_adds_epu8_rrk_128:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT:    vpaddusb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdc,0xd1]
-; CHECK-NEXT:    vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask)
-  ret <16 x i8> %res
-}
-
-define <16 x i8> @test_mask_adds_epu8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
-; CHECK-LABEL: test_mask_adds_epu8_rrkz_128:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT:    vpaddusb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdc,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
-  ret <16 x i8> %res
-}
-
-define <16 x i8> @test_mask_adds_epu8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) {
-; CHECK-LABEL: test_mask_adds_epu8_rm_128:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpaddusb (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdc,0x07]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %b = load <16 x i8>, <16 x i8>* %ptr_b
-  %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
-  ret <16 x i8> %res
-}
-
-define <16 x i8> @test_mask_adds_epu8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
-; CHECK-LABEL: test_mask_adds_epu8_rmk_128:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT:    vpaddusb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdc,0x0f]
-; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %b = load <16 x i8>, <16 x i8>* %ptr_b
-  %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask)
-  ret <16 x i8> %res
-}
-
-define <16 x i8> @test_mask_adds_epu8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) {
-; CHECK-LABEL: test_mask_adds_epu8_rmkz_128:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT:    vpaddusb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdc,0x07]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %b = load <16 x i8>, <16 x i8>* %ptr_b
-  %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
-  ret <16 x i8> %res
-}
-
-declare <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
-
-define <32 x i8> @test_mask_adds_epu8_rr_256(<32 x i8> %a, <32 x i8> %b) {
-; CHECK-LABEL: test_mask_adds_epu8_rr_256:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpaddusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdc,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
-  ret <32 x i8> %res
-}
-
-define <32 x i8> @test_mask_adds_epu8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) {
-; CHECK-LABEL: test_mask_adds_epu8_rrk_256:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT:    vpaddusb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdc,0xd1]
-; CHECK-NEXT:    vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask)
-  ret <32 x i8> %res
-}
-
-define <32 x i8> @test_mask_adds_epu8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
-; CHECK-LABEL: test_mask_adds_epu8_rrkz_256:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT:    vpaddusb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdc,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask)
-  ret <32 x i8> %res
-}
-
-define <32 x i8> @test_mask_adds_epu8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) {
-; CHECK-LABEL: test_mask_adds_epu8_rm_256:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpaddusb (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdc,0x07]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %b = load <32 x i8>, <32 x i8>* %ptr_b
-  %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
-  ret <32 x i8> %res
-}
-
-define <32 x i8> @test_mask_adds_epu8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
-; CHECK-LABEL: test_mask_adds_epu8_rmk_256:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT:    vpaddusb (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdc,0x0f]
-; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %b = load <32 x i8>, <32 x i8>* %ptr_b
-  %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask)
-  ret <32 x i8> %res
-}
-
-define <32 x i8> @test_mask_adds_epu8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) {
-; CHECK-LABEL: test_mask_adds_epu8_rmkz_256:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT:    vpaddusb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdc,0x07]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %b = load <32 x i8>, <32 x i8>* %ptr_b
-  %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask)
-  ret <32 x i8> %res
-}
-
-declare <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
-
-define <16 x i8> @test_mask_subs_epu8_rr_128(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-LABEL: test_mask_subs_epu8_rr_128:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpsubusb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd8,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
-  ret <16 x i8> %res
-}
-
-define <16 x i8> @test_mask_subs_epu8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) {
-; CHECK-LABEL: test_mask_subs_epu8_rrk_128:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT:    vpsubusb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd8,0xd1]
-; CHECK-NEXT:    vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask)
-  ret <16 x i8> %res
-}
-
-define <16 x i8> @test_mask_subs_epu8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
-; CHECK-LABEL: test_mask_subs_epu8_rrkz_128:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT:    vpsubusb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd8,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
-  ret <16 x i8> %res
-}
-
-define <16 x i8> @test_mask_subs_epu8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) {
-; CHECK-LABEL: test_mask_subs_epu8_rm_128:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpsubusb (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd8,0x07]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %b = load <16 x i8>, <16 x i8>* %ptr_b
-  %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
-  ret <16 x i8> %res
-}
-
-define <16 x i8> @test_mask_subs_epu8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
-; CHECK-LABEL: test_mask_subs_epu8_rmk_128:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT:    vpsubusb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd8,0x0f]
-; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %b = load <16 x i8>, <16 x i8>* %ptr_b
-  %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask)
-  ret <16 x i8> %res
-}
-
-define <16 x i8> @test_mask_subs_epu8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) {
-; CHECK-LABEL: test_mask_subs_epu8_rmkz_128:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT:    vpsubusb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd8,0x07]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %b = load <16 x i8>, <16 x i8>* %ptr_b
-  %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
-  ret <16 x i8> %res
-}
-
-declare <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
-
-define <32 x i8> @test_mask_subs_epu8_rr_256(<32 x i8> %a, <32 x i8> %b) {
-; CHECK-LABEL: test_mask_subs_epu8_rr_256:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd8,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
-  ret <32 x i8> %res
-}
-
-define <32 x i8> @test_mask_subs_epu8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) {
-; CHECK-LABEL: test_mask_subs_epu8_rrk_256:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT:    vpsubusb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd8,0xd1]
-; CHECK-NEXT:    vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask)
-  ret <32 x i8> %res
-}
-
-define <32 x i8> @test_mask_subs_epu8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
-; CHECK-LABEL: test_mask_subs_epu8_rrkz_256:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd8,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask)
-  ret <32 x i8> %res
-}
-
-define <32 x i8> @test_mask_subs_epu8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) {
-; CHECK-LABEL: test_mask_subs_epu8_rm_256:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpsubusb (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd8,0x07]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %b = load <32 x i8>, <32 x i8>* %ptr_b
-  %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
-  ret <32 x i8> %res
-}
-
-define <32 x i8> @test_mask_subs_epu8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
-; CHECK-LABEL: test_mask_subs_epu8_rmk_256:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT:    vpsubusb (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd8,0x0f]
-; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %b = load <32 x i8>, <32 x i8>* %ptr_b
-  %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask)
-  ret <32 x i8> %res
-}
-
-define <32 x i8> @test_mask_subs_epu8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) {
-; CHECK-LABEL: test_mask_subs_epu8_rmkz_256:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT:    vpsubusb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd8,0x07]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %b = load <32 x i8>, <32 x i8>* %ptr_b
-  %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask)
-  ret <32 x i8> %res
-}
-
-declare <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
+declare <8 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
 
 define <8 x i16>@test_int_x86_avx512_mask_vpermt2var_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_hi_128:
Index: test/CodeGen/X86/sse2-intrinsics-canonical.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/sse2-intrinsics-canonical.ll
@@ -0,0 +1,664 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=-avx,+sse2 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
+; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+avx2 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=VCHECK --check-prefix=AVX2
+; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=skx -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=VCHECK --check-prefix=SKX
+
+; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/sse2-builtins.c
+
+define <16 x i8> @test_x86_sse2_padds_b(<16 x i8> %a0, <16 x i8> %a1) {
+; SSE-LABEL: test_x86_sse2_padds_b:
+; SSE:       ## %bb.0:
+; SSE-NEXT:    paddsb %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xec,0xc1]
+; SSE-NEXT:    retl ## encoding: [0xc3]
+;
+; AVX2-LABEL: test_x86_sse2_padds_b:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpaddsb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xec,0xc1]
+; AVX2-NEXT:    retl ## encoding: [0xc3]
+;
+; SKX-LABEL: test_x86_sse2_padds_b:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vpaddsb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xec,0xc1]
+; SKX-NEXT:    retl ## encoding: [0xc3]
+  %1 = sext <16 x i8> %a0 to <16 x i16>
+  %2 = sext <16 x i8> %a1 to <16 x i16>
+  %3 = add nsw <16 x i16> %1, %2
+  %4 = icmp slt <16 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %6 = icmp sgt <16 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %8 = trunc <16 x i16> %7 to <16 x i8>
+  ret <16 x i8> %8
+}
+
+define <8 x i16> @test_x86_sse2_padds_w(<8 x i16> %a0, <8 x i16> %a1) {
+; SSE-LABEL: test_x86_sse2_padds_w:
+; SSE:       ## %bb.0:
+; SSE-NEXT:    paddsw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xed,0xc1]
+; SSE-NEXT:    retl ## encoding: [0xc3]
+;
+; AVX2-LABEL: test_x86_sse2_padds_w:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpaddsw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xed,0xc1]
+; AVX2-NEXT:    retl ## encoding: [0xc3]
+;
+; SKX-LABEL: test_x86_sse2_padds_w:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vpaddsw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xed,0xc1]
+; SKX-NEXT:    retl ## encoding: [0xc3]
+  %1 = sext <8 x i16> %a0 to <8 x i32>
+  %2 = sext <8 x i16> %a1 to <8 x i32>
+  %3 = add nsw <8 x i32> %1, %2
+  %4 = icmp slt <8 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %6 = icmp sgt <8 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %8 = trunc <8 x i32> %7 to <8 x i16>
+  ret <8 x i16> %8
+}
+
+define <16 x i8> @test_x86_sse2_paddus_b(<16 x i8> %a0, <16 x i8> %a1) {
+; SSE-LABEL: test_x86_sse2_paddus_b:
+; SSE:       ## %bb.0:
+; SSE-NEXT:    paddusb %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xdc,0xc1]
+; SSE-NEXT:    retl ## encoding: [0xc3]
+;
+; AVX2-LABEL: test_x86_sse2_paddus_b:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpaddusb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xdc,0xc1]
+; AVX2-NEXT:    retl ## encoding: [0xc3]
+;
+; SKX-LABEL: test_x86_sse2_paddus_b:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vpaddusb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdc,0xc1]
+; SKX-NEXT:    retl ## encoding: [0xc3]
+  %1 = add <16 x i8> %a0, %a1
+  %2 = icmp ugt <16 x i8> %a0, %1
+  %3 = select <16 x i1> %2, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <16 x i8> %1
+  ret <16 x i8> %3
+}
+
+define <8 x i16> @test_x86_sse2_paddus_w(<8 x i16> %a0, <8 x i16> %a1) {
+; SSE-LABEL: test_x86_sse2_paddus_w:
+; SSE:       ## %bb.0:
+; SSE-NEXT:    paddusw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xdd,0xc1]
+; SSE-NEXT:    retl ## encoding: [0xc3]
+;
+; AVX2-LABEL: test_x86_sse2_paddus_w:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpaddusw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xdd,0xc1]
+; AVX2-NEXT:    retl ## encoding: [0xc3]
+;
+; SKX-LABEL: test_x86_sse2_paddus_w:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vpaddusw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdd,0xc1]
+; SKX-NEXT:    retl ## encoding: [0xc3]
+  %1 = add <8 x i16> %a0, %a1
+  %2 = icmp ugt <8 x i16> %a0, %1
+  %3 = select <8 x i1> %2, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <8 x i16> %1
+  ret <8 x i16> %3
+}
+
+define <16 x i8> @test_x86_sse2_psubs_b(<16 x i8> %a0, <16 x i8> %a1) {
+; SSE-LABEL: test_x86_sse2_psubs_b:
+; SSE:       ## %bb.0:
+; SSE-NEXT:    psubsb %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xe8,0xc1]
+; SSE-NEXT:    retl ## encoding: [0xc3]
+;
+; AVX2-LABEL: test_x86_sse2_psubs_b:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpsubsb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe8,0xc1]
+; AVX2-NEXT:    retl ## encoding: [0xc3]
+;
+; SKX-LABEL: test_x86_sse2_psubs_b:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vpsubsb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe8,0xc1]
+; SKX-NEXT:    retl ## encoding: [0xc3]
+  %1 = sext <16 x i8> %a0 to <16 x i16>
+  %2 = sext <16 x i8> %a1 to <16 x i16>
+  %3 = sub nsw <16 x i16> %1, %2
+  %4 = icmp slt <16 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %6 = icmp sgt <16 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %8 = trunc <16 x i16> %7 to <16 x i8>
+  ret <16 x i8> %8
+}
+
+define <8 x i16> @test_x86_sse2_psubs_w(<8 x i16> %a0, <8 x i16> %a1) {
+; SSE-LABEL: test_x86_sse2_psubs_w:
+; SSE:       ## %bb.0:
+; SSE-NEXT:    psubsw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xe9,0xc1]
+; SSE-NEXT:    retl ## encoding: [0xc3]
+;
+; AVX2-LABEL: test_x86_sse2_psubs_w:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpsubsw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe9,0xc1]
+; AVX2-NEXT:    retl ## encoding: [0xc3]
+;
+; SKX-LABEL: test_x86_sse2_psubs_w:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vpsubsw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe9,0xc1]
+; SKX-NEXT:    retl ## encoding: [0xc3]
+  %1 = sext <8 x i16> %a0 to <8 x i32>
+  %2 = sext <8 x i16> %a1 to <8 x i32>
+  %3 = sub nsw <8 x i32> %1, %2
+  %4 = icmp slt <8 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %6 = icmp sgt <8 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %8 = trunc <8 x i32> %7 to <8 x i16>
+  ret <8 x i16> %8
+}
+
+define <16 x i8> @test_x86_sse2_psubus_b(<16 x i8> %a0, <16 x i8> %a1) {
+; SSE-LABEL: test_x86_sse2_psubus_b:
+; SSE:       ## %bb.0:
+; SSE-NEXT:    psubusb %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xd8,0xc1]
+; SSE-NEXT:    retl ## encoding: [0xc3]
+;
+; AVX2-LABEL: test_x86_sse2_psubus_b:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpsubusb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xd8,0xc1]
+; AVX2-NEXT:    retl ## encoding: [0xc3]
+;
+; SKX-LABEL: test_x86_sse2_psubus_b:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vpsubusb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd8,0xc1]
+; SKX-NEXT:    retl ## encoding: [0xc3]
+  %cmp = icmp ugt <16 x i8> %a0, %a1
+  %sel = select <16 x i1> %cmp, <16 x i8> %a0, <16 x i8> %a1
+  %sub = sub <16 x i8> %sel, %a1
+  ret <16 x i8> %sub
+}
+
+define <8 x i16> @test_x86_sse2_psubus_w(<8 x i16> %a0, <8 x i16> %a1) {
+; SSE-LABEL: test_x86_sse2_psubus_w:
+; SSE:       ## %bb.0:
+; SSE-NEXT:    psubusw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xd9,0xc1]
+; SSE-NEXT:    retl ## encoding: [0xc3]
+;
+; AVX2-LABEL: test_x86_sse2_psubus_w:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xd9,0xc1]
+; AVX2-NEXT:    retl ## encoding: [0xc3]
+;
+; SKX-LABEL: test_x86_sse2_psubus_w:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd9,0xc1]
+; SKX-NEXT:    retl ## encoding: [0xc3]
+  %cmp = icmp ugt <8 x i16> %a0, %a1
+  %sel = select <8 x i1> %cmp, <8 x i16> %a0, <8 x i16> %a1
+  %sub = sub <8 x i16> %sel, %a1
+  ret <8 x i16> %sub
+}
+
+define <8 x i8> @test_x86_sse2_padds_b_64(<8 x i8> %a0, <8 x i8> %a1) {
+; SSE-LABEL: test_x86_sse2_padds_b_64:
+; SSE:       ## %bb.0:
+; SSE-NEXT:    psllw $8, %xmm0 ## encoding: [0x66,0x0f,0x71,0xf0,0x08]
+; SSE-NEXT:    psraw $8, %xmm0 ## encoding: [0x66,0x0f,0x71,0xe0,0x08]
+; SSE-NEXT:    psllw $8, %xmm1 ## encoding: [0x66,0x0f,0x71,0xf1,0x08]
+; SSE-NEXT:    psraw $8, %xmm1 ## encoding: [0x66,0x0f,0x71,0xe1,0x08]
+; SSE-NEXT:    paddw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xfd,0xc1]
+; SSE-NEXT:    pminsw LCPI8_0, %xmm0 ## encoding: [0x66,0x0f,0xea,0x05,A,A,A,A]
+; SSE-NEXT:    ## fixup A - offset: 4, value: LCPI8_0, kind: FK_Data_4
+; SSE-NEXT:    pmaxsw LCPI8_1, %xmm0 ## encoding: [0x66,0x0f,0xee,0x05,A,A,A,A]
+; SSE-NEXT:    ## fixup A - offset: 4, value: LCPI8_1, kind: FK_Data_4
+; SSE-NEXT:    retl ## encoding: [0xc3]
+;
+; AVX2-LABEL: test_x86_sse2_padds_b_64:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpsllw $8, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x71,0xf0,0x08]
+; AVX2-NEXT:    vpsraw $8, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x71,0xe0,0x08]
+; AVX2-NEXT:    vpsllw $8, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x71,0xf1,0x08]
+; AVX2-NEXT:    vpsraw $8, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x71,0xe1,0x08]
+; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc1]
+; AVX2-NEXT:    vpminsw LCPI8_0, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xea,0x05,A,A,A,A]
+; AVX2-NEXT:    ## fixup A - offset: 4, value: LCPI8_0, kind: FK_Data_4
+; AVX2-NEXT:    vpmaxsw LCPI8_1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xee,0x05,A,A,A,A]
+; AVX2-NEXT:    ## fixup A - offset: 4, value: LCPI8_1, kind: FK_Data_4
+; AVX2-NEXT:    retl ## encoding: [0xc3]
+;
+; SKX-LABEL: test_x86_sse2_padds_b_64:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vpsllw $8, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x71,0xf0,0x08]
+; SKX-NEXT:    vpsraw $8, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x71,0xe0,0x08]
+; SKX-NEXT:    vpsllw $8, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x71,0xf1,0x08]
+; SKX-NEXT:    vpsraw $8, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x71,0xe1,0x08]
+; SKX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc1]
+; SKX-NEXT:    vpminsw LCPI8_0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xea,0x05,A,A,A,A]
+; SKX-NEXT:    ## fixup A - offset: 4, value: LCPI8_0, kind: FK_Data_4
+; SKX-NEXT:    vpmaxsw LCPI8_1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xee,0x05,A,A,A,A]
+; SKX-NEXT:    ## fixup A - offset: 4, value: LCPI8_1, kind: FK_Data_4
+; SKX-NEXT:    retl ## encoding: [0xc3]
+  %1 = sext <8 x i8> %a0 to <8 x i16>
+  %2 = sext <8 x i8> %a1 to <8 x i16>
+  %3 = add nsw <8 x i16> %1, %2
+  %4 = icmp slt <8 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %5 = select <8 x i1> %4, <8 x i16> %3, <8 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %6 = icmp sgt <8 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %7 = select <8 x i1> %6, <8 x i16> %5, <8 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %8 = trunc <8 x i16> %7 to <8 x i8>
+  ret <8 x i8> %8
+}
+
+define <4 x i16> @test_x86_sse2_padds_w_64(<4 x i16> %a0, <4 x i16> %a1) {
+; SSE-LABEL: test_x86_sse2_padds_w_64:
+; SSE:       ## %bb.0:
+; SSE-NEXT:    pslld $16, %xmm0 ## encoding: [0x66,0x0f,0x72,0xf0,0x10]
+; SSE-NEXT:    psrad $16, %xmm0 ## encoding: [0x66,0x0f,0x72,0xe0,0x10]
+; SSE-NEXT:    pslld $16, %xmm1 ## encoding: [0x66,0x0f,0x72,0xf1,0x10]
+; SSE-NEXT:    psrad $16, %xmm1 ## encoding: [0x66,0x0f,0x72,0xe1,0x10]
+; SSE-NEXT:    paddd %xmm0, %xmm1 ## encoding: [0x66,0x0f,0xfe,0xc8]
+; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [32767,32767,32767,32767]
+; SSE-NEXT:    ## encoding: [0x66,0x0f,0x6f,0x05,A,A,A,A]
+; SSE-NEXT:    ## fixup A - offset: 4, value: LCPI9_0, kind: FK_Data_4
+; SSE-NEXT:    movdqa %xmm0, %xmm2 ## encoding: [0x66,0x0f,0x6f,0xd0]
+; SSE-NEXT:    pcmpgtd %xmm1, %xmm2 ## encoding: [0x66,0x0f,0x66,0xd1]
+; SSE-NEXT:    pand %xmm2, %xmm1 ## encoding: [0x66,0x0f,0xdb,0xca]
+; SSE-NEXT:    pandn %xmm0, %xmm2 ## encoding: [0x66,0x0f,0xdf,0xd0]
+; SSE-NEXT:    por %xmm1, %xmm2 ## encoding: [0x66,0x0f,0xeb,0xd1]
+; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [4294934528,4294934528,4294934528,4294934528]
+; SSE-NEXT:    ## encoding: [0x66,0x0f,0x6f,0x0d,A,A,A,A]
+; SSE-NEXT:    ## fixup A - offset: 4, value: LCPI9_1, kind: FK_Data_4
+; SSE-NEXT:    movdqa %xmm2, %xmm0 ## encoding: [0x66,0x0f,0x6f,0xc2]
+; SSE-NEXT:    pcmpgtd %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x66,0xc1]
+; SSE-NEXT:    pand %xmm0, %xmm2 ## encoding: [0x66,0x0f,0xdb,0xd0]
+; SSE-NEXT:    pandn %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xdf,0xc1]
+; SSE-NEXT:    por %xmm2, %xmm0 ## encoding: [0x66,0x0f,0xeb,0xc2]
+; SSE-NEXT:    retl ## encoding: [0xc3]
+;
+; AVX2-LABEL: test_x86_sse2_padds_w_64:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpslld $16, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x72,0xf0,0x10]
+; AVX2-NEXT:    vpsrad $16, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x72,0xe0,0x10]
+; AVX2-NEXT:    vpslld $16, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x72,0xf1,0x10]
+; AVX2-NEXT:    vpsrad $16, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x72,0xe1,0x10]
+; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfe,0xc1]
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [32767,32767,32767,32767]
+; AVX2-NEXT:    ## encoding: [0xc4,0xe2,0x79,0x58,0x0d,A,A,A,A]
+; AVX2-NEXT:    ## fixup A - offset: 5, value: LCPI9_0, kind: FK_Data_4
+; AVX2-NEXT:    vpminsd %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x39,0xc1]
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4294934528,4294934528,4294934528,4294934528]
+; AVX2-NEXT:    ## encoding: [0xc4,0xe2,0x79,0x58,0x0d,A,A,A,A]
+; AVX2-NEXT:    ## fixup A - offset: 5, value: LCPI9_1, kind: FK_Data_4
+; AVX2-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x3d,0xc1]
+; AVX2-NEXT:    retl ## encoding: [0xc3]
+;
+; SKX-LABEL: test_x86_sse2_padds_w_64:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vpslld $16, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x72,0xf0,0x10]
+; SKX-NEXT:    vpsrad $16, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x72,0xe0,0x10]
+; SKX-NEXT:    vpslld $16, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x72,0xf1,0x10]
+; SKX-NEXT:    vpsrad $16, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x72,0xe1,0x10]
+; SKX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1]
+; SKX-NEXT:    vpminsd LCPI9_0{1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x18,0x39,0x05,A,A,A,A]
+; SKX-NEXT:    ## fixup A - offset: 6, value: LCPI9_0, kind: FK_Data_4
+; SKX-NEXT:    vpmaxsd LCPI9_1{1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x18,0x3d,0x05,A,A,A,A]
+; SKX-NEXT:    ## fixup A - offset: 6, value: LCPI9_1, kind: FK_Data_4
+; SKX-NEXT:    retl ## encoding: [0xc3]
+  %1 = sext <4 x i16> %a0 to <4 x i32>
+  %2 = sext <4 x i16> %a1 to <4 x i32>
+  %3 = add nsw <4 x i32> %1, %2
+  %4 = icmp slt <4 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767>
+  %5 = select <4 x i1> %4, <4 x i32> %3, <4 x i32> <i32 32767, i32 32767, i32 32767, i32 32767>
+  %6 = icmp sgt <4 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %7 = select <4 x i1> %6, <4 x i32> %5, <4 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %8 = trunc <4 x i32> %7 to <4 x i16>
+  ret <4 x i16> %8
+}
+
+define <8 x i8> @test_x86_sse2_paddus_b_64(<8 x i8> %a0, <8 x i8> %a1) {
+; SSE-LABEL: test_x86_sse2_paddus_b_64:
+; SSE:       ## %bb.0:
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE-NEXT:    ## encoding: [0x66,0x0f,0x6f,0x15,A,A,A,A]
+; SSE-NEXT:    ## fixup A - offset: 4, value: LCPI10_0, kind: FK_Data_4
+; SSE-NEXT:    paddw %xmm0, %xmm1 ## encoding: [0x66,0x0f,0xfd,0xc8]
+; SSE-NEXT:    pand %xmm2, %xmm0 ## encoding: [0x66,0x0f,0xdb,0xc2]
+; SSE-NEXT:    pand %xmm1, %xmm2 ## encoding: [0x66,0x0f,0xdb,0xd1]
+; SSE-NEXT:    pcmpgtw %xmm2, %xmm0 ## encoding: [0x66,0x0f,0x65,0xc2]
+; SSE-NEXT:    movdqa %xmm0, %xmm2 ## encoding: [0x66,0x0f,0x6f,0xd0]
+; SSE-NEXT:    pandn %xmm1, %xmm2 ## encoding: [0x66,0x0f,0xdf,0xd1]
+; SSE-NEXT:    pand LCPI10_0, %xmm0 ## encoding: [0x66,0x0f,0xdb,0x05,A,A,A,A]
+; SSE-NEXT:    ## fixup A - offset: 4, value: LCPI10_0, kind: FK_Data_4
+; SSE-NEXT:    por %xmm2, %xmm0 ## encoding: [0x66,0x0f,0xeb,0xc2]
+; SSE-NEXT:    retl ## encoding: [0xc3]
+;
+; AVX2-LABEL: test_x86_sse2_paddus_b_64:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX2-NEXT:    ## encoding: [0xc5,0xf9,0x6f,0x15,A,A,A,A]
+; AVX2-NEXT:    ## fixup A - offset: 4, value: LCPI10_0, kind: FK_Data_4
+; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm3 ## encoding: [0xc5,0xf9,0xdb,0xda]
+; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc1]
+; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm1 ## encoding: [0xc5,0xf9,0xdb,0xca]
+; AVX2-NEXT:    vpcmpgtw %xmm1, %xmm3, %xmm1 ## encoding: [0xc5,0xe1,0x65,0xc9]
+; AVX2-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x4c,0xc2,0x10]
+; AVX2-NEXT:    retl ## encoding: [0xc3]
+;
+; SKX-LABEL: test_x86_sse2_paddus_b_64:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vmovdqa LCPI10_0, %xmm2 ## EVEX TO VEX Compression xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SKX-NEXT:    ## encoding: [0xc5,0xf9,0x6f,0x15,A,A,A,A]
+; SKX-NEXT:    ## fixup A - offset: 4, value: LCPI10_0, kind: FK_Data_4
+; SKX-NEXT:    vpand %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdb,0xda]
+; SKX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc1]
+; SKX-NEXT:    vpand %xmm2, %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdb,0xca]
+; SKX-NEXT:    vpcmpnleuw %xmm1, %xmm3, %k1 ## encoding: [0x62,0xf3,0xe5,0x08,0x3e,0xc9,0x06]
+; SKX-NEXT:    vmovdqu16 LCPI10_0, %xmm0 {%k1} ## encoding: [0x62,0xf1,0xff,0x09,0x6f,0x05,A,A,A,A]
+; SKX-NEXT:    ## fixup A - offset: 6, value: LCPI10_0, kind: FK_Data_4
+; SKX-NEXT:    retl ## encoding: [0xc3]
+  %1 = add <8 x i8> %a0, %a1
+  %2 = icmp ugt <8 x i8> %a0, %1
+  %3 = select <8 x i1> %2, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> %1
+  ret <8 x i8> %3
+}
+
+define <4 x i16> @test_x86_sse2_paddus_w_64(<4 x i16> %a0, <4 x i16> %a1) {
+; SSE-LABEL: test_x86_sse2_paddus_w_64:
+; SSE:       ## %bb.0:
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0]
+; SSE-NEXT:    ## encoding: [0x66,0x0f,0x6f,0x15,A,A,A,A]
+; SSE-NEXT:    ## fixup A - offset: 4, value: LCPI11_0, kind: FK_Data_4
+; SSE-NEXT:    paddd %xmm0, %xmm1 ## encoding: [0x66,0x0f,0xfe,0xc8]
+; SSE-NEXT:    pand %xmm2, %xmm0 ## encoding: [0x66,0x0f,0xdb,0xc2]
+; SSE-NEXT:    pand %xmm1, %xmm2 ## encoding: [0x66,0x0f,0xdb,0xd1]
+; SSE-NEXT:    pcmpgtd %xmm2, %xmm0 ## encoding: [0x66,0x0f,0x66,0xc2]
+; SSE-NEXT:    movdqa %xmm0, %xmm2 ## encoding: [0x66,0x0f,0x6f,0xd0]
+; SSE-NEXT:    pandn %xmm1, %xmm2 ## encoding: [0x66,0x0f,0xdf,0xd1]
+; SSE-NEXT:    pand LCPI11_0, %xmm0 ## encoding: [0x66,0x0f,0xdb,0x05,A,A,A,A]
+; SSE-NEXT:    ## fixup A - offset: 4, value: LCPI11_0, kind: FK_Data_4
+; SSE-NEXT:    por %xmm2, %xmm0 ## encoding: [0x66,0x0f,0xeb,0xc2]
+; SSE-NEXT:    retl ## encoding: [0xc3]
+;
+; AVX2-LABEL: test_x86_sse2_paddus_w_64:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xef,0xd2]
+; AVX2-NEXT:    vpblendw $170, %xmm2, %xmm0, %xmm3 ## encoding: [0xc4,0xe3,0x79,0x0e,0xda,0xaa]
+; AVX2-NEXT:    ## xmm3 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
+; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfe,0xc1]
+; AVX2-NEXT:    vpblendw $170, %xmm2, %xmm0, %xmm1 ## encoding: [0xc4,0xe3,0x79,0x0e,0xca,0xaa]
+; AVX2-NEXT:    ## xmm1 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
+; AVX2-NEXT:    vpcmpgtd %xmm1, %xmm3, %xmm1 ## encoding: [0xc5,0xe1,0x66,0xc9]
+; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm2 = [65535,65535,65535,65535]
+; AVX2-NEXT:    ## encoding: [0xc4,0xe2,0x79,0x18,0x15,A,A,A,A]
+; AVX2-NEXT:    ## fixup A - offset: 5, value: LCPI11_0, kind: FK_Data_4
+; AVX2-NEXT:    vblendvps %xmm1, %xmm2, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x4a,0xc2,0x10]
+; AVX2-NEXT:    retl ## encoding: [0xc3]
+;
+; SKX-LABEL: test_x86_sse2_paddus_w_64:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
+; SKX-NEXT:    vpblendw $170, %xmm2, %xmm0, %xmm3 ## encoding: [0xc4,0xe3,0x79,0x0e,0xda,0xaa]
+; SKX-NEXT:    ## xmm3 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
+; SKX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1]
+; SKX-NEXT:    vpblendw $170, %xmm2, %xmm0, %xmm1 ## encoding: [0xc4,0xe3,0x79,0x0e,0xca,0xaa]
+; SKX-NEXT:    ## xmm1 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
+; SKX-NEXT:    vpcmpnleud %xmm1, %xmm3, %k1 ## encoding: [0x62,0xf3,0x65,0x08,0x1e,0xc9,0x06]
+; SKX-NEXT:    vpbroadcastd LCPI11_0, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x58,0x05,A,A,A,A]
+; SKX-NEXT:    ## fixup A - offset: 6, value: LCPI11_0, kind: FK_Data_4
+; SKX-NEXT:    retl ## encoding: [0xc3]
+  %1 = add <4 x i16> %a0, %a1
+  %2 = icmp ugt <4 x i16> %a0, %1
+  %3 = select <4 x i1> %2, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>, <4 x i16> %1
+  ret <4 x i16> %3
+}
+
+define <8 x i8> @test_x86_sse2_psubs_b_64(<8 x i8> %a0, <8 x i8> %a1) {
+; SSE-LABEL: test_x86_sse2_psubs_b_64:
+; SSE:       ## %bb.0:
+; SSE-NEXT:    psllw $8, %xmm0 ## encoding: [0x66,0x0f,0x71,0xf0,0x08]
+; SSE-NEXT:    psraw $8, %xmm0 ## encoding: [0x66,0x0f,0x71,0xe0,0x08]
+; SSE-NEXT:    psllw $8, %xmm1 ## encoding: [0x66,0x0f,0x71,0xf1,0x08]
+; SSE-NEXT:    psraw $8, %xmm1 ## encoding: [0x66,0x0f,0x71,0xe1,0x08]
+; SSE-NEXT:    psubw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xf9,0xc1]
+; SSE-NEXT:    pminsw LCPI12_0, %xmm0 ## encoding: [0x66,0x0f,0xea,0x05,A,A,A,A]
+; SSE-NEXT:    ## fixup A - offset: 4, value: LCPI12_0, kind: FK_Data_4
+; SSE-NEXT:    pmaxsw LCPI12_1, %xmm0 ## encoding: [0x66,0x0f,0xee,0x05,A,A,A,A]
+; SSE-NEXT:    ## fixup A - offset: 4, value: LCPI12_1, kind: FK_Data_4
+; SSE-NEXT:    retl ## encoding: [0xc3]
+;
+; AVX2-LABEL: test_x86_sse2_psubs_b_64:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpsllw $8, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x71,0xf0,0x08]
+; AVX2-NEXT:    vpsraw $8, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x71,0xe0,0x08]
+; AVX2-NEXT:    vpsllw $8, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x71,0xf1,0x08]
+; AVX2-NEXT:    vpsraw $8, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x71,0xe1,0x08]
+; AVX2-NEXT:    vpsubw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xf9,0xc1]
+; AVX2-NEXT:    vpminsw LCPI12_0, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xea,0x05,A,A,A,A]
+; AVX2-NEXT:    ## fixup A - offset: 4, value: LCPI12_0, kind: FK_Data_4
+; AVX2-NEXT:    vpmaxsw LCPI12_1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xee,0x05,A,A,A,A]
+; AVX2-NEXT:    ## fixup A - offset: 4, value: LCPI12_1, kind: FK_Data_4
+; AVX2-NEXT:    retl ## encoding: [0xc3]
+;
+; SKX-LABEL: test_x86_sse2_psubs_b_64:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vpsllw $8, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x71,0xf0,0x08]
+; SKX-NEXT:    vpsraw $8, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x71,0xe0,0x08]
+; SKX-NEXT:    vpsllw $8, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x71,0xf1,0x08]
+; SKX-NEXT:    vpsraw $8, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x71,0xe1,0x08]
+; SKX-NEXT:    vpsubw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xf9,0xc1]
+; SKX-NEXT:    vpminsw LCPI12_0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xea,0x05,A,A,A,A]
+; SKX-NEXT:    ## fixup A - offset: 4, value: LCPI12_0, kind: FK_Data_4
+; SKX-NEXT:    vpmaxsw LCPI12_1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xee,0x05,A,A,A,A]
+; SKX-NEXT:    ## fixup A - offset: 4, value: LCPI12_1, kind: FK_Data_4
+; SKX-NEXT:    retl ## encoding: [0xc3]
+  %1 = sext <8 x i8> %a0 to <8 x i16>
+  %2 = sext <8 x i8> %a1 to <8 x i16>
+  %3 = sub nsw <8 x i16> %1, %2
+  %4 = icmp slt <8 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %5 = select <8 x i1> %4, <8 x i16> %3, <8 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %6 = icmp sgt <8 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %7 = select <8 x i1> %6, <8 x i16> %5, <8 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %8 = trunc <8 x i16> %7 to <8 x i8>
+  ret <8 x i8> %8
+}
+
+define <4 x i16> @test_x86_sse2_psubs_w_64(<4 x i16> %a0, <4 x i16> %a1) {
+; SSE-LABEL: test_x86_sse2_psubs_w_64:
+; SSE:       ## %bb.0:
+; SSE-NEXT:    pslld $16, %xmm0 ## encoding: [0x66,0x0f,0x72,0xf0,0x10]
+; SSE-NEXT:    psrad $16, %xmm0 ## encoding: [0x66,0x0f,0x72,0xe0,0x10]
+; SSE-NEXT:    pslld $16, %xmm1 ## encoding: [0x66,0x0f,0x72,0xf1,0x10]
+; SSE-NEXT:    psrad $16, %xmm1 ## encoding: [0x66,0x0f,0x72,0xe1,0x10]
+; SSE-NEXT:    psubd %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xfa,0xc1]
+; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [32767,32767,32767,32767]
+; SSE-NEXT:    ## encoding: [0x66,0x0f,0x6f,0x0d,A,A,A,A]
+; SSE-NEXT:    ## fixup A - offset: 4, value: LCPI13_0, kind: FK_Data_4
+; SSE-NEXT:    movdqa %xmm1, %xmm2 ## encoding: [0x66,0x0f,0x6f,0xd1]
+; SSE-NEXT:    pcmpgtd %xmm0, %xmm2 ## encoding: [0x66,0x0f,0x66,0xd0]
+; SSE-NEXT:    pand %xmm2, %xmm0 ## encoding: [0x66,0x0f,0xdb,0xc2]
+; SSE-NEXT:    pandn %xmm1, %xmm2 ## encoding: [0x66,0x0f,0xdf,0xd1]
+; SSE-NEXT:    por %xmm0, %xmm2 ## encoding: [0x66,0x0f,0xeb,0xd0]
+; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [4294934528,4294934528,4294934528,4294934528]
+; SSE-NEXT:    ## encoding: [0x66,0x0f,0x6f,0x0d,A,A,A,A]
+; SSE-NEXT:    ## fixup A - offset: 4, value: LCPI13_1, kind: FK_Data_4
+; SSE-NEXT:    movdqa %xmm2, %xmm0 ## encoding: [0x66,0x0f,0x6f,0xc2]
+; SSE-NEXT:    pcmpgtd %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x66,0xc1]
+; SSE-NEXT:    pand %xmm0, %xmm2 ## encoding: [0x66,0x0f,0xdb,0xd0]
+; SSE-NEXT:    pandn %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xdf,0xc1]
+; SSE-NEXT:    por %xmm2, %xmm0 ## encoding: [0x66,0x0f,0xeb,0xc2]
+; SSE-NEXT:    retl ## encoding: [0xc3]
+;
+; AVX2-LABEL: test_x86_sse2_psubs_w_64:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpslld $16, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x72,0xf0,0x10]
+; AVX2-NEXT:    vpsrad $16, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x72,0xe0,0x10]
+; AVX2-NEXT:    vpslld $16, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x72,0xf1,0x10]
+; AVX2-NEXT:    vpsrad $16, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x72,0xe1,0x10]
+; AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfa,0xc1]
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [32767,32767,32767,32767]
+; AVX2-NEXT:    ## encoding: [0xc4,0xe2,0x79,0x58,0x0d,A,A,A,A]
+; AVX2-NEXT:    ## fixup A - offset: 5, value: LCPI13_0, kind: FK_Data_4
+; AVX2-NEXT:    vpminsd %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x39,0xc1]
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4294934528,4294934528,4294934528,4294934528]
+; AVX2-NEXT:    ## encoding: [0xc4,0xe2,0x79,0x58,0x0d,A,A,A,A]
+; AVX2-NEXT:    ## fixup A - offset: 5, value: LCPI13_1, kind: FK_Data_4
+; AVX2-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x3d,0xc1]
+; AVX2-NEXT:    retl ## encoding: [0xc3]
+;
+; SKX-LABEL: test_x86_sse2_psubs_w_64:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vpslld $16, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x72,0xf0,0x10]
+; SKX-NEXT:    vpsrad $16, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x72,0xe0,0x10]
+; SKX-NEXT:    vpslld $16, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x72,0xf1,0x10]
+; SKX-NEXT:    vpsrad $16, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x72,0xe1,0x10]
+; SKX-NEXT:    vpsubd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfa,0xc1]
+; SKX-NEXT:    vpminsd LCPI13_0{1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x18,0x39,0x05,A,A,A,A]
+; SKX-NEXT:    ## fixup A - offset: 6, value: LCPI13_0, kind: FK_Data_4
+; SKX-NEXT:    vpmaxsd LCPI13_1{1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x18,0x3d,0x05,A,A,A,A]
+; SKX-NEXT:    ## fixup A - offset: 6, value: LCPI13_1, kind: FK_Data_4
+; SKX-NEXT:    retl ## encoding: [0xc3]
+  %1 = sext <4 x i16> %a0 to <4 x i32>
+  %2 = sext <4 x i16> %a1 to <4 x i32>
+  %3 = sub nsw <4 x i32> %1, %2
+  %4 = icmp slt <4 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767>
+  %5 = select <4 x i1> %4, <4 x i32> %3, <4 x i32> <i32 32767, i32 32767, i32 32767, i32 32767>
+  %6 = icmp sgt <4 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %7 = select <4 x i1> %6, <4 x i32> %5, <4 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %8 = trunc <4 x i32> %7 to <4 x i16>
+  ret <4 x i16> %8
+}
+
+define <8 x i8> @test_x86_sse2_psubus_b_64(<8 x i8> %a0, <8 x i8> %a1) {
+; SSE-LABEL: test_x86_sse2_psubus_b_64:
+; SSE:       ## %bb.0:
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE-NEXT:    ## encoding: [0x66,0x0f,0x6f,0x15,A,A,A,A]
+; SSE-NEXT:    ## fixup A - offset: 4, value: LCPI14_0, kind: FK_Data_4
+; SSE-NEXT:    movdqa %xmm1, %xmm3 ## encoding: [0x66,0x0f,0x6f,0xd9]
+; SSE-NEXT:    pand %xmm2, %xmm3 ## encoding: [0x66,0x0f,0xdb,0xda]
+; SSE-NEXT:    pand %xmm2, %xmm0 ## encoding: [0x66,0x0f,0xdb,0xc2]
+; SSE-NEXT:    pmaxsw %xmm3, %xmm0 ## encoding: [0x66,0x0f,0xee,0xc3]
+; SSE-NEXT:    psubw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xf9,0xc1]
+; SSE-NEXT:    retl ## encoding: [0xc3]
+;
+; AVX2-LABEL: test_x86_sse2_psubus_b_64:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX2-NEXT:    ## encoding: [0xc5,0xf9,0x6f,0x15,A,A,A,A]
+; AVX2-NEXT:    ## fixup A - offset: 4, value: LCPI14_0, kind: FK_Data_4
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm3 ## encoding: [0xc5,0xf1,0xdb,0xda]
+; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xdb,0xc2]
+; AVX2-NEXT:    vpmaxuw %xmm3, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x3e,0xc3]
+; AVX2-NEXT:    vpsubw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xf9,0xc1]
+; AVX2-NEXT:    retl ## encoding: [0xc3]
+;
+; SKX-LABEL: test_x86_sse2_psubus_b_64:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vmovdqa LCPI14_0, %xmm2 ## EVEX TO VEX Compression xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SKX-NEXT:    ## encoding: [0xc5,0xf9,0x6f,0x15,A,A,A,A]
+; SKX-NEXT:    ## fixup A - offset: 4, value: LCPI14_0, kind: FK_Data_4
+; SKX-NEXT:    vpand %xmm2, %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xdb,0xda]
+; SKX-NEXT:    vpand %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdb,0xc2]
+; SKX-NEXT:    vpmaxuw %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x3e,0xc3]
+; SKX-NEXT:    vpsubw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xf9,0xc1]
+; SKX-NEXT:    retl ## encoding: [0xc3]
+  %cmp = icmp ugt <8 x i8> %a0, %a1
+  %sel = select <8 x i1> %cmp, <8 x i8> %a0, <8 x i8> %a1
+  %sub = sub <8 x i8> %sel, %a1
+  ret <8 x i8> %sub
+}
+
+define <4 x i16> @test_x86_sse2_psubus_w_64(<4 x i16> %a0, <4 x i16> %a1) {
+; SSE-LABEL: test_x86_sse2_psubus_w_64:
+; SSE:       ## %bb.0:
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0]
+; SSE-NEXT:    ## encoding: [0x66,0x0f,0x6f,0x15,A,A,A,A]
+; SSE-NEXT:    ## fixup A - offset: 4, value: LCPI15_0, kind: FK_Data_4
+; SSE-NEXT:    movdqa %xmm1, %xmm3 ## encoding: [0x66,0x0f,0x6f,0xd9]
+; SSE-NEXT:    pand %xmm2, %xmm3 ## encoding: [0x66,0x0f,0xdb,0xda]
+; SSE-NEXT:    pand %xmm2, %xmm0 ## encoding: [0x66,0x0f,0xdb,0xc2]
+; SSE-NEXT:    movdqa %xmm0, %xmm2 ## encoding: [0x66,0x0f,0x6f,0xd0]
+; SSE-NEXT:    pcmpgtd %xmm3, %xmm2 ## encoding: [0x66,0x0f,0x66,0xd3]
+; SSE-NEXT:    pand %xmm2, %xmm0 ## encoding: [0x66,0x0f,0xdb,0xc2]
+; SSE-NEXT:    pandn %xmm3, %xmm2 ## encoding: [0x66,0x0f,0xdf,0xd3]
+; SSE-NEXT:    por %xmm0, %xmm2 ## encoding: [0x66,0x0f,0xeb,0xd0]
+; SSE-NEXT:    psubd %xmm1, %xmm2 ## encoding: [0x66,0x0f,0xfa,0xd1]
+; SSE-NEXT:    movdqa %xmm2, %xmm0 ## encoding: [0x66,0x0f,0x6f,0xc2]
+; SSE-NEXT:    retl ## encoding: [0xc3]
+;
+; AVX2-LABEL: test_x86_sse2_psubus_w_64:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xef,0xd2]
+; AVX2-NEXT:    vpblendw $170, %xmm2, %xmm1, %xmm3 ## encoding: [0xc4,0xe3,0x71,0x0e,0xda,0xaa]
+; AVX2-NEXT:    ## xmm3 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; AVX2-NEXT:    vpblendw $170, %xmm2, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0e,0xc2,0xaa]
+; AVX2-NEXT:    ## xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
+; AVX2-NEXT:    vpmaxud %xmm3, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x3f,0xc3]
+; AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfa,0xc1]
+; AVX2-NEXT:    retl ## encoding: [0xc3]
+;
+; SKX-LABEL: test_x86_sse2_psubus_w_64:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
+; SKX-NEXT:    vpblendw $170, %xmm2, %xmm1, %xmm3 ## encoding: [0xc4,0xe3,0x71,0x0e,0xda,0xaa]
+; SKX-NEXT:    ## xmm3 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; SKX-NEXT:    vpblendw $170, %xmm2, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0e,0xc2,0xaa]
+; SKX-NEXT:    ## xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
+; SKX-NEXT:    vpmaxud %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x3f,0xc3]
+; SKX-NEXT:    vpsubd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfa,0xc1]
+; SKX-NEXT:    retl ## encoding: [0xc3]
+  %cmp = icmp ugt <4 x i16> %a0, %a1
+  %sel = select <4 x i1> %cmp, <4 x i16> %a0, <4 x i16> %a1
+  %sub = sub <4 x i16> %sel, %a1
+  ret <4 x i16> %sub
+}
+
+define <8 x i16> @test_x86_sse2_padds_w_fail(<8 x i16> %a0, <8 x i16> %a1) {
+; SSE-LABEL: test_x86_sse2_padds_w_fail:
+; SSE:       ## %bb.0:
+; SSE-NEXT:    xorps %xmm0, %xmm0 ## encoding: [0x0f,0x57,0xc0]
+; SSE-NEXT:    retl ## encoding: [0xc3]
+;
+; AVX2-LABEL: test_x86_sse2_padds_w_fail:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x57,0xc0]
+; AVX2-NEXT:    retl ## encoding: [0xc3]
+;
+; SKX-LABEL: test_x86_sse2_padds_w_fail:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vpxor %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0]
+; SKX-NEXT:    vpmovsdw %ymm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x23,0xc0]
+; SKX-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; SKX-NEXT:    retl ## encoding: [0xc3]
+  %1 = shufflevector <16 x i8> zeroinitializer, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %2 = sext <8 x i8> %1 to <8 x i32>
+  %3 = add nuw nsw <8 x i32> zeroinitializer, %2
+  %4 = icmp slt <8 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %6 = icmp sgt <8 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %8 = trunc <8 x i32> %7 to <8 x i16>
+  ret <8 x i16> %8
+}
+
+define <8 x i16> @test_x86_sse2_psubs_w_fail(<8 x i16> %a0, <8 x i16> %a1) {
+; SSE-LABEL: test_x86_sse2_psubs_w_fail:
+; SSE:       ## %bb.0:
+; SSE-NEXT:    xorps %xmm0, %xmm0 ## encoding: [0x0f,0x57,0xc0]
+; SSE-NEXT:    retl ## encoding: [0xc3]
+;
+; AVX2-LABEL: test_x86_sse2_psubs_w_fail:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x57,0xc0]
+; AVX2-NEXT:    retl ## encoding: [0xc3]
+;
+; SKX-LABEL: test_x86_sse2_psubs_w_fail:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vpxor %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0]
+; SKX-NEXT:    vpmovsdw %ymm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x23,0xc0]
+; SKX-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; SKX-NEXT:    retl ## encoding: [0xc3]
+  %1 = shufflevector <16 x i8> zeroinitializer, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %2 = sext <8 x i8> %1 to <8 x i32>
+  %3 = sub nuw nsw <8 x i32> zeroinitializer, %2
+  %4 = icmp slt <8 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %6 = icmp sgt <8 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %8 = trunc <8 x i32> %7 to <8 x i16>
+  ret <8 x i16> %8
+}
Index: test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
===================================================================
--- test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
+++ test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
@@ -112,8 +112,15 @@
 ; X64-NEXT:    retq
   %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
   %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
-  %res = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %arg0, <16 x i8> %arg1)
-  %bc = bitcast <16 x i8> %res to <2 x i64>
+  %1 = sext <16 x i8> %arg0 to <16 x i16>
+  %2 = sext <16 x i8> %arg1 to <16 x i16>
+  %3 = add nsw <16 x i16> %1, %2
+  %4 = icmp slt <16 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %6 = icmp sgt <16 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %8 = trunc <16 x i16> %7 to <16 x i8>
+  %bc = bitcast <16 x i8> %8 to <2 x i64>
   ret <2 x i64> %bc
 }
 declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone
@@ -130,11 +137,17 @@
 ; X64-NEXT:    retq
   %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
   %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
-  %res = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %arg0, <8 x i16> %arg1)
-  %bc = bitcast <8 x i16> %res to <2 x i64>
+  %1 = sext <8 x i16> %arg0 to <8 x i32>
+  %2 = sext <8 x i16> %arg1 to <8 x i32>
+  %3 = add nsw <8 x i32> %1, %2
+  %4 = icmp slt <8 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %6 = icmp sgt <8 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %8 = trunc <8 x i32> %7 to <8 x i16>
+  %bc = bitcast <8 x i16> %8 to <2 x i64>
   ret <2 x i64> %bc
 }
-declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone
 
 define <2 x i64> @test_mm_adds_epu8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
 ; X32-LABEL: test_mm_adds_epu8:
@@ -148,11 +161,12 @@
 ; X64-NEXT:    retq
   %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
   %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
-  %res = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %arg0, <16 x i8> %arg1)
-  %bc = bitcast <16 x i8> %res to <2 x i64>
+  %1 = add <16 x i8> %arg0, %arg1
+  %2 = icmp ugt <16 x i8> %arg0, %1
+  %3 = select <16 x i1> %2, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <16 x i8> %1
+  %bc = bitcast <16 x i8> %3 to <2 x i64>
   ret <2 x i64> %bc
 }
-declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone
 
 define <2 x i64> @test_mm_adds_epu16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
 ; X32-LABEL: test_mm_adds_epu16:
@@ -166,11 +180,12 @@
 ; X64-NEXT:    retq
   %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
   %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
-  %res = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %arg0, <8 x i16> %arg1)
-  %bc = bitcast <8 x i16> %res to <2 x i64>
+  %1 = add <8 x i16> %arg0, %arg1
+  %2 = icmp ugt <8 x i16> %arg0, %1
+  %3 = select <8 x i1> %2, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <8 x i16> %1
+  %bc = bitcast <8 x i16> %3 to <2 x i64>
   ret <2 x i64> %bc
 }
-declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone
 
 define <2 x double> @test_mm_and_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
 ; X32-LABEL: test_mm_and_pd:
@@ -3506,11 +3521,17 @@
 ; X64-NEXT:    retq
   %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
   %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
-  %res = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %arg0, <16 x i8> %arg1)
-  %bc = bitcast <16 x i8> %res to <2 x i64>
+  %1 = sext <16 x i8> %arg0 to <16 x i16>
+  %2 = sext <16 x i8> %arg1 to <16 x i16>
+  %3 = sub nsw <16 x i16> %1, %2
+  %4 = icmp slt <16 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
+  %6 = icmp sgt <16 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
+  %8 = trunc <16 x i16> %7 to <16 x i8>
+  %bc = bitcast <16 x i8> %8 to <2 x i64>
   ret <2 x i64> %bc
 }
-declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone
 
 define <2 x i64> @test_mm_subs_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
 ; X32-LABEL: test_mm_subs_epi16:
@@ -3524,47 +3545,69 @@
 ; X64-NEXT:    retq
   %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
   %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
-  %res = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %arg0, <8 x i16> %arg1)
-  %bc = bitcast <8 x i16> %res to <2 x i64>
+  %1 = sext <8 x i16> %arg0 to <8 x i32>
+  %2 = sext <8 x i16> %arg1 to <8 x i32>
+  %3 = sub nsw <8 x i32> %1, %2
+  %4 = icmp slt <8 x i32> %3, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %6 = icmp sgt <8 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+  %8 = trunc <8 x i32> %7 to <8 x i16>
+  %bc = bitcast <8 x i16> %8 to <2 x i64>
   ret <2 x i64> %bc
 }
-declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone
 
 define <2 x i64> @test_mm_subs_epu8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
 ; X32-LABEL: test_mm_subs_epu8:
 ; X32:       # %bb.0:
-; X32-NEXT:    psubusb %xmm1, %xmm0
+; X32-NEXT:    pmaxub %xmm1, %xmm0
+; X32-NEXT:    psubb %xmm1, %xmm0
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test_mm_subs_epu8:
 ; X64:       # %bb.0:
-; X64-NEXT:    psubusb %xmm1, %xmm0
+; X64-NEXT:    pmaxub %xmm1, %xmm0
+; X64-NEXT:    psubb %xmm1, %xmm0
 ; X64-NEXT:    retq
   %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
   %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
-  %res = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %arg0, <16 x i8> %arg1)
-  %bc = bitcast <16 x i8> %res to <2 x i64>
+  %cmp = icmp ugt <16 x i8> %arg0, %arg1
+  %sel = select <16 x i1> %cmp, <16 x i8> %arg0, <16 x i8> %arg1
+  %sub = sub <16 x i8> %sel, %arg1
+  %bc = bitcast <16 x i8> %sub to <2 x i64>
   ret <2 x i64> %bc
 }
-declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone
 
 define <2 x i64> @test_mm_subs_epu16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
 ; X32-LABEL: test_mm_subs_epu16:
 ; X32:       # %bb.0:
-; X32-NEXT:    psubusw %xmm1, %xmm0
+; X32-NEXT:    movdqa .LCPI190_0, %xmm2 # xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X32-NEXT:    movdqa %xmm1, %xmm3
+; X32-NEXT:    pxor %xmm2, %xmm3
+; X32-NEXT:    pxor %xmm2, %xmm0
+; X32-NEXT:    pmaxsw %xmm3, %xmm0
+; X32-NEXT:    pxor %xmm2, %xmm0
+; X32-NEXT:    psubw %xmm1, %xmm0
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test_mm_subs_epu16:
 ; X64:       # %bb.0:
-; X64-NEXT:    psubusw %xmm1, %xmm0
+; X64-NEXT:    movdqa .LCPI190_0(%rip), %xmm2 # xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X64-NEXT:    movdqa %xmm1, %xmm3
+; X64-NEXT:    pxor %xmm2, %xmm3
+; X64-NEXT:    pxor %xmm2, %xmm0
+; X64-NEXT:    pmaxsw %xmm3, %xmm0
+; X64-NEXT:    pxor %xmm2, %xmm0
+; X64-NEXT:    psubw %xmm1, %xmm0
 ; X64-NEXT:    retq
   %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
   %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
-  %res = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %arg0, <8 x i16> %arg1)
-  %bc = bitcast <8 x i16> %res to <2 x i64>
+  %cmp = icmp ugt <8 x i16> %arg0, %arg1
+  %sel = select <8 x i1> %cmp, <8 x i16> %arg0, <8 x i16> %arg1
+  %sub = sub <8 x i16> %sel, %arg1
+  %bc = bitcast <8 x i16> %sub to <2 x i64>
   ret <2 x i64> %bc
 }
-declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone
 
 define i32 @test_mm_ucomieq_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
 ; X32-LABEL: test_mm_ucomieq_sd:
Index: test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
===================================================================
--- test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
+++ test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
@@ -316,3 +316,87 @@
   ret <4 x float> %res
 }
 declare <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32>) nounwind readnone
+
+
+define <16 x i8> @test_x86_sse2_paddus_b(<16 x i8> %a0, <16 x i8> %a1) {
+; SSE-LABEL: test_x86_sse2_paddus_b:
+; SSE:       ## %bb.0:
+; SSE-NEXT:    paddusb %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xdc,0xc1]
+; SSE-NEXT:    retl ## encoding: [0xc3]
+;
+; AVX2-LABEL: test_x86_sse2_paddus_b:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpaddusb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xdc,0xc1]
+; AVX2-NEXT:    retl ## encoding: [0xc3]
+;
+; SKX-LABEL: test_x86_sse2_paddus_b:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vpaddusb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdc,0xc1]
+; SKX-NEXT:    retl ## encoding: [0xc3]
+  %res = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
+  ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone
+
+
+define <8 x i16> @test_x86_sse2_paddus_w(<8 x i16> %a0, <8 x i16> %a1) {
+; SSE-LABEL: test_x86_sse2_paddus_w:
+; SSE:       ## %bb.0:
+; SSE-NEXT:    paddusw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xdd,0xc1]
+; SSE-NEXT:    retl ## encoding: [0xc3]
+;
+; AVX2-LABEL: test_x86_sse2_paddus_w:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpaddusw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xdd,0xc1]
+; AVX2-NEXT:    retl ## encoding: [0xc3]
+;
+; SKX-LABEL: test_x86_sse2_paddus_w:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vpaddusw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdd,0xc1]
+; SKX-NEXT:    retl ## encoding: [0xc3]
+  %res = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
+  ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+
+define <16 x i8> @test_x86_sse2_psubus_b(<16 x i8> %a0, <16 x i8> %a1) {
+; SSE-LABEL: test_x86_sse2_psubus_b:
+; SSE:       ## %bb.0:
+; SSE-NEXT:    psubusb %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xd8,0xc1]
+; SSE-NEXT:    retl ## encoding: [0xc3]
+;
+; AVX2-LABEL: test_x86_sse2_psubus_b:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpsubusb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xd8,0xc1]
+; AVX2-NEXT:    retl ## encoding: [0xc3]
+;
+; SKX-LABEL: test_x86_sse2_psubus_b:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vpsubusb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd8,0xc1]
+; SKX-NEXT:    retl ## encoding: [0xc3]
+  %res = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
+  ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone
+
+
+define <8 x i16> @test_x86_sse2_psubus_w(<8 x i16> %a0, <8 x i16> %a1) {
+; SSE-LABEL: test_x86_sse2_psubus_w:
+; SSE:       ## %bb.0:
+; SSE-NEXT:    psubusw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xd9,0xc1]
+; SSE-NEXT:    retl ## encoding: [0xc3]
+;
+; AVX2-LABEL: test_x86_sse2_psubus_w:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xd9,0xc1]
+; AVX2-NEXT:    retl ## encoding: [0xc3]
+;
+; SKX-LABEL: test_x86_sse2_psubus_w:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd9,0xc1]
+; SKX-NEXT:    retl ## encoding: [0xc3]
+  %res = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
+  ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone
Index: test/CodeGen/X86/sse2-intrinsics-x86.ll
===================================================================
--- test/CodeGen/X86/sse2-intrinsics-x86.ll
+++ test/CodeGen/X86/sse2-intrinsics-x86.ll
@@ -830,48 +830,6 @@
 declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone
 
 
-define <16 x i8> @test_x86_sse2_paddus_b(<16 x i8> %a0, <16 x i8> %a1) {
-; SSE-LABEL: test_x86_sse2_paddus_b:
-; SSE:       ## %bb.0:
-; SSE-NEXT:    paddusb %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xdc,0xc1]
-; SSE-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX2-LABEL: test_x86_sse2_paddus_b:
-; AVX2:       ## %bb.0:
-; AVX2-NEXT:    vpaddusb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xdc,0xc1]
-; AVX2-NEXT:    retl ## encoding: [0xc3]
-;
-; SKX-LABEL: test_x86_sse2_paddus_b:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vpaddusb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdc,0xc1]
-; SKX-NEXT:    retl ## encoding: [0xc3]
-  %res = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
-  ret <16 x i8> %res
-}
-declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone
-
-
-define <8 x i16> @test_x86_sse2_paddus_w(<8 x i16> %a0, <8 x i16> %a1) {
-; SSE-LABEL: test_x86_sse2_paddus_w:
-; SSE:       ## %bb.0:
-; SSE-NEXT:    paddusw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xdd,0xc1]
-; SSE-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX2-LABEL: test_x86_sse2_paddus_w:
-; AVX2:       ## %bb.0:
-; AVX2-NEXT:    vpaddusw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xdd,0xc1]
-; AVX2-NEXT:    retl ## encoding: [0xc3]
-;
-; SKX-LABEL: test_x86_sse2_paddus_w:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vpaddusw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdd,0xc1]
-; SKX-NEXT:    retl ## encoding: [0xc3]
-  %res = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
-  ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone
-
-
 define <4 x i32> @test_x86_sse2_pmadd_wd(<8 x i16> %a0, <8 x i16> %a1) {
 ; SSE-LABEL: test_x86_sse2_pmadd_wd:
 ; SSE:       ## %bb.0:
@@ -1434,48 +1392,6 @@
 declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone
 
 
-define <16 x i8> @test_x86_sse2_psubus_b(<16 x i8> %a0, <16 x i8> %a1) {
-; SSE-LABEL: test_x86_sse2_psubus_b:
-; SSE:       ## %bb.0:
-; SSE-NEXT:    psubusb %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xd8,0xc1]
-; SSE-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX2-LABEL: test_x86_sse2_psubus_b:
-; AVX2:       ## %bb.0:
-; AVX2-NEXT:    vpsubusb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xd8,0xc1]
-; AVX2-NEXT:    retl ## encoding: [0xc3]
-;
-; SKX-LABEL: test_x86_sse2_psubus_b:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vpsubusb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd8,0xc1]
-; SKX-NEXT:    retl ## encoding: [0xc3]
-  %res = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
-  ret <16 x i8> %res
-}
-declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone
-
-
-define <8 x i16> @test_x86_sse2_psubus_w(<8 x i16> %a0, <8 x i16> %a1) {
-; SSE-LABEL: test_x86_sse2_psubus_w:
-; SSE:       ## %bb.0:
-; SSE-NEXT:    psubusw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xd9,0xc1]
-; SSE-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX2-LABEL: test_x86_sse2_psubus_w:
-; AVX2:       ## %bb.0:
-; AVX2-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xd9,0xc1]
-; AVX2-NEXT:    retl ## encoding: [0xc3]
-;
-; SKX-LABEL: test_x86_sse2_psubus_w:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd9,0xc1]
-; SKX-NEXT:    retl ## encoding: [0xc3]
-  %res = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
-  ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone
-
-
 define <2 x double> @test_x86_sse2_sqrt_pd(<2 x double> %a0) {
 ; SSE-LABEL: test_x86_sse2_sqrt_pd:
 ; SSE:       ## %bb.0: