Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -23137,12 +23137,36 @@
   return SDValue();
 }
 
+// Patterns emitted from IR might saturate the count operand, because shifting
+// an element by its bit width or more creates a poison value in IR. The x86
+// shift nodes handle out-of-range counts themselves, so remove the saturation here.
+static SDValue
+RemoveSatFromScalarVarShift(SDValue Op, SelectionDAG &DAG,
+                            bool IsArith) {
+  MVT VT = Op.getSimpleValueType();
+  unsigned EltSize = VT.getSizeInBits();
+
+  if (Op.getOpcode() == ISD::TRUNCATE ||
+      Op.getOpcode() == ISD::ZERO_EXTEND)
+    Op = Op.getOperand(0);
+
+  if (Op.getOpcode() == ISD::UMIN) {
+    auto *ConstVal = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+    if (ConstVal && ((!IsArith && ConstVal->getZExtValue() == EltSize) ||
+                     (IsArith && ConstVal->getZExtValue() == (EltSize - 1)))) {
+      return Op.getOperand(0);
+    }
+  }
+  return SDValue();
+}
+
 static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
                                         const X86Subtarget &Subtarget) {
   MVT VT = Op.getSimpleValueType();
   SDLoc dl(Op);
   SDValue R = Op.getOperand(0);
   SDValue Amt = Op.getOperand(1);
+  bool IsArith = Op.getOpcode() == ISD::SRA;
   unsigned X86OpcI = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
     (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
 
@@ -23160,6 +23184,9 @@
       BaseShAmt = BV->getSplatValue();
       if (BaseShAmt && BaseShAmt.isUndef())
         BaseShAmt = SDValue();
+      else if (BaseShAmt)
+        if (SDValue V = RemoveSatFromScalarVarShift(BaseShAmt, DAG, IsArith))
+          return getTargetVShiftNode(X86OpcI, dl, VT, R, V, Subtarget, DAG);
     } else {
       if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
         Amt = Amt.getOperand(0);
@@ -23199,7 +23226,7 @@
   }
 
   // Check cases (mainly 32-bit) where i64 is expanded into high and low parts.
-  if (VT == MVT::v2i64 && Amt.getOpcode() == ISD::BITCAST && 
+  if (VT == MVT::v2i64 && Amt.getOpcode() == ISD::BITCAST &&
       Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
     Amt = Amt.getOperand(0);
     unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
@@ -33257,6 +33284,167 @@
   return SDValue();
 }
 
+// Canonicalize the pattern created by lowering an x86 intrinsic.
+// The lowering function requires an ISD::UMIN node to match the pattern.
+// The SelectionDAG created from IR sometimes includes a
+// (trunc (select setcc, x, y)) fragment which gets optimized to
+// (select setcc, (trunc x), (trunc y)),
+// which prevents combining to a umin node.
+// This function forces the select combine to be done first.
+// In the case of a variable shift, remove the umin node (it was only needed in IR).
+static SDValue
+foldShiftArithmeticIntrinsicPattern(SDNode *N, SelectionDAG &DAG) {
+  unsigned Opcode = N->getOpcode();
+  if (Opcode != ISD::SRA)
+    return SDValue();
+
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  EVT VT = N0.getValueType();
+  if (!VT.isVector())
+    return SDValue();
+
+  EVT SVT = VT.getVectorElementType();
+  unsigned EltSize = VT.getScalarSizeInBits();
+  SDLoc DL(N);
+
+  if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(N1)) {
+    ArrayRef<int> Mask = SVN->getMask();
+    for (unsigned i = 0; i < Mask.size(); ++i)
+      if (Mask[i] != 0)
+        return SDValue();
+
+    SDValue Insert = N1.getOperand(0);
+    if (Insert.getOpcode() != ISD::INSERT_VECTOR_ELT ||
+        N1.getOperand(1).getOpcode() != ISD::UNDEF ||
+        Insert.getOperand(0).getOpcode() != ISD::UNDEF)
+      return SDValue();
+
+    auto *ConstZero = dyn_cast<ConstantSDNode>(Insert.getOperand(2));
+    if (!ConstZero || !ConstZero->isNullValue())
+      return SDValue();
+
+    SDValue Tmp = Insert.getOperand(1);
+    SDValue BaseShAmt = Tmp;
+    if (Tmp.getOpcode() == ISD::TRUNCATE ||
+        Tmp.getOpcode() == ISD::ZERO_EXTEND)
+      BaseShAmt = Tmp.getOperand(0);
+
+    if (BaseShAmt.getOpcode() != ISD::SELECT ||
+        BaseShAmt.getOperand(0).getOpcode() != ISD::SETCC ||
+        BaseShAmt.getOperand(1) != BaseShAmt.getOperand(0).getOperand(0) ||
+        BaseShAmt.getOperand(2) != BaseShAmt.getOperand(0).getOperand(1))
+      return SDValue();
+
+    ISD::CondCode CC =
+        cast<CondCodeSDNode>(BaseShAmt.getOperand(0).getOperand(2))->get();
+    auto *ConstVal = dyn_cast<ConstantSDNode>(BaseShAmt.getOperand(2));
+    if (!ConstVal || CC != ISD::SETULT ||
+        ConstVal->getZExtValue() != (uint64_t)(EltSize - 1))
+      return SDValue();
+
+    MVT BSATy = BaseShAmt.getSimpleValueType();
+    BaseShAmt = DAG.getNode(ISD::UMIN, DL, BSATy, BaseShAmt.getOperand(1),
+                            BaseShAmt.getOperand(2));
+
+    if (Tmp.getOpcode() != ISD::SELECT)
+      BaseShAmt = DAG.getNode(Tmp.getOpcode(), DL, SVT, BaseShAmt);
+
+    Insert = DAG.getNode(Insert.getOpcode(), DL, VT, Insert.getOperand(0),
+                         BaseShAmt, Insert.getOperand(2));
+
+    N1 = DAG.getVectorShuffle(VT, DL, Insert, N1.getOperand(1), Mask);
+    return DAG.getNode(ISD::SRA, DL, VT, N0, N1);
+  } else if (N1.getOpcode() == ISD::UMIN) {
+    APInt SplatValue;
+    if (ISD::isConstantSplatVector(N1.getOperand(1).getNode(), SplatValue) &&
+        SplatValue.getZExtValue() == (uint64_t)(EltSize - 1)) {
+      N1 = N1.getOperand(0);
+      return DAG.getNode(ISD::SRA, DL, VT, N0, N1);
+    }
+  }
+
+  return SDValue();
+}
+
+// Canonicalize the pattern created by lowering an x86 intrinsic.
+// The lowering function requires an ISD::UMIN node to match the pattern.
+// fold (shl (select (setcc y, max, ult), xvec, zerovec),
+//           (select (setcc y, max, ult), yvec, zerovec))
+// -> (shl xvec, (umin y, max)vec)
+// or, in case of a variable shift:
+// fold (shl (vselect (setcc yvec, maxvec, ult), xvec, zerovec),
+//           (vselect (setcc yvec, maxvec, ult), yvec, zerovec))
+// -> (shl xvec, yvec)
+static SDValue foldShiftLogicalIntrinsicPattern(SDNode *N, SelectionDAG &DAG) {
+  unsigned Opcode = N->getOpcode();
+  if (Opcode != ISD::SHL && Opcode != ISD::SRL)
+    return SDValue();
+
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  EVT VT = N0.getValueType();
+  if (!VT.isVector())
+    return SDValue();
+
+  EVT SVT = VT.getVectorElementType();
+  unsigned OpSizeInBits = VT.getScalarSizeInBits();
+  SDLoc DL(N);
+
+  auto CheckCommonPart = [](SDValue N0, SDValue N1) {
+    SDValue SETCC0 = N0.getOperand(0);
+    SDValue SETCC1 = N1.getOperand(0);
+    if (SETCC0.getOpcode() != ISD::SETCC || SETCC0 != SETCC1)
+      return false;
+
+    if (cast<CondCodeSDNode>(SETCC0.getOperand(2))->get() != ISD::SETULT)
+      return false;
+
+    if (!ISD::isBuildVectorAllZeros(N0.getOperand(2).getNode()) ||
+        !ISD::isBuildVectorAllZeros(N1.getOperand(2).getNode()))
+      return false;
+
+    return true;
+  };
+
+  if (N0.getOpcode() == ISD::SELECT && N1.getOpcode() == ISD::SELECT) {
+    if (!CheckCommonPart(N0, N1))
+      return SDValue();
+
+    SDValue SETCC0 = N0.getOperand(0);
+    auto *MaxNode = dyn_cast<ConstantSDNode>(SETCC0.getOperand(1).getNode());
+    if (!MaxNode || MaxNode->getZExtValue() != (uint64_t)OpSizeInBits)
+      return SDValue();
+
+    SDValue LHS = N0.getOperand(1);
+
+    SDValue Count = SETCC0.getOperand(0);
+    SDValue Max = SETCC0.getOperand(1);
+    EVT CountTy = Count.getValueType();
+    Count = DAG.getNode(ISD::UMIN, DL, CountTy, Count, Max);
+    Count = DAG.getZExtOrTrunc(Count, DL, SVT);
+    SDValue RHS = DAG.getSplatBuildVector(VT, DL, Count);
+
+    return DAG.getNode(Opcode, DL, VT, LHS, RHS);
+  } else if (N0.getOpcode() == ISD::VSELECT && N1.getOpcode() == ISD::VSELECT) {
+    if (!CheckCommonPart(N0, N1))
+      return SDValue();
+
+    SDValue Max = N0.getOperand(0).getOperand(1);
+    APInt SplatValue;
+    if (!ISD::isConstantSplatVector(Max.getNode(), SplatValue) ||
+        SplatValue != APInt(OpSizeInBits, OpSizeInBits))
+      return SDValue();
+
+    SDValue LHS = N0.getOperand(1);
+    SDValue RHS = N1.getOperand(1);
+
+    return DAG.getNode(Opcode, DL, VT, LHS, RHS);
+  }
+
+  return SDValue();
+}
+
 static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
@@ -33313,6 +33501,10 @@
     return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
   }
 
+  // Try to fold the pattern emitted by lowering an x86 intrinsic.
+  if (SDValue V = foldShiftLogicalIntrinsicPattern(N, DAG))
+    return V;
+
   return SDValue();
 }
 
@@ -33322,6 +33514,10 @@
   EVT VT = N0.getValueType();
   unsigned Size = VT.getSizeInBits();
 
+  // Try to fold the pattern emitted by lowering an x86 intrinsic.
+  if (SDValue V = foldShiftArithmeticIntrinsicPattern(N, DAG))
+    return V;
+
   // fold (ashr (shl, a, [56,48,32,24,16]), SarConst)
   // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or
   // into (lshr, (sext (a), SarConst - [56,48,32,24,16]))
@@ -33375,6 +33571,10 @@
   SDValue N1 = N->getOperand(1);
   EVT VT = N0.getValueType();
 
+  // Try to fold the pattern emitted by lowering an x86 intrinsic.
+  if (SDValue V = foldShiftLogicalIntrinsicPattern(N, DAG))
+    return V;
+
   // Only do this on the last DAG combine as it can interfere with other
   // combines.
if (!DCI.isAfterLegalizeDAG()) Index: lib/Transforms/InstCombine/InstCombineCalls.cpp =================================================================== --- lib/Transforms/InstCombine/InstCombineCalls.cpp +++ lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -247,79 +247,138 @@ InstCombiner::BuilderTy &Builder) { bool LogicalShift = false; bool ShiftLeft = false; + bool IsCountOperandInteger = false; switch (II.getIntrinsicID()) { default: llvm_unreachable("Unexpected intrinsic!"); case Intrinsic::x86_sse2_psra_d: case Intrinsic::x86_sse2_psra_w: - case Intrinsic::x86_sse2_psrai_d: - case Intrinsic::x86_sse2_psrai_w: case Intrinsic::x86_avx2_psra_d: case Intrinsic::x86_avx2_psra_w: - case Intrinsic::x86_avx2_psrai_d: - case Intrinsic::x86_avx2_psrai_w: case Intrinsic::x86_avx512_psra_q_128: - case Intrinsic::x86_avx512_psrai_q_128: case Intrinsic::x86_avx512_psra_q_256: - case Intrinsic::x86_avx512_psrai_q_256: case Intrinsic::x86_avx512_psra_d_512: case Intrinsic::x86_avx512_psra_q_512: case Intrinsic::x86_avx512_psra_w_512: + LogicalShift = false; ShiftLeft = false; IsCountOperandInteger = false; + break; + case Intrinsic::x86_sse2_psrai_d: + case Intrinsic::x86_sse2_psrai_w: + case Intrinsic::x86_avx2_psrai_d: + case Intrinsic::x86_avx2_psrai_w: + case Intrinsic::x86_avx512_psrai_q_128: + case Intrinsic::x86_avx512_psrai_q_256: case Intrinsic::x86_avx512_psrai_d_512: case Intrinsic::x86_avx512_psrai_q_512: case Intrinsic::x86_avx512_psrai_w_512: - LogicalShift = false; ShiftLeft = false; + LogicalShift = false; ShiftLeft = false; IsCountOperandInteger = true; break; case Intrinsic::x86_sse2_psrl_d: case Intrinsic::x86_sse2_psrl_q: case Intrinsic::x86_sse2_psrl_w: - case Intrinsic::x86_sse2_psrli_d: - case Intrinsic::x86_sse2_psrli_q: - case Intrinsic::x86_sse2_psrli_w: case Intrinsic::x86_avx2_psrl_d: case Intrinsic::x86_avx2_psrl_q: case Intrinsic::x86_avx2_psrl_w: - case Intrinsic::x86_avx2_psrli_d: - case Intrinsic::x86_avx2_psrli_q: - case Intrinsic::x86_avx2_psrli_w: case Intrinsic::x86_avx512_psrl_d_512: case Intrinsic::x86_avx512_psrl_q_512: case Intrinsic::x86_avx512_psrl_w_512: + LogicalShift = true; ShiftLeft = false; IsCountOperandInteger = false; + break; + case Intrinsic::x86_sse2_psrli_d: + case Intrinsic::x86_sse2_psrli_q: + case Intrinsic::x86_sse2_psrli_w: + case Intrinsic::x86_avx2_psrli_d: + case Intrinsic::x86_avx2_psrli_q: + case Intrinsic::x86_avx2_psrli_w: case Intrinsic::x86_avx512_psrli_d_512: case Intrinsic::x86_avx512_psrli_q_512: case Intrinsic::x86_avx512_psrli_w_512: - LogicalShift = true; ShiftLeft = false; + LogicalShift = true; ShiftLeft = false; IsCountOperandInteger = true; break; case Intrinsic::x86_sse2_psll_d: case Intrinsic::x86_sse2_psll_q: case Intrinsic::x86_sse2_psll_w: - case Intrinsic::x86_sse2_pslli_d: - case Intrinsic::x86_sse2_pslli_q: - case Intrinsic::x86_sse2_pslli_w: case Intrinsic::x86_avx2_psll_d: case Intrinsic::x86_avx2_psll_q: case Intrinsic::x86_avx2_psll_w: - case Intrinsic::x86_avx2_pslli_d: - case Intrinsic::x86_avx2_pslli_q: - case Intrinsic::x86_avx2_pslli_w: case Intrinsic::x86_avx512_psll_d_512: case Intrinsic::x86_avx512_psll_q_512: case Intrinsic::x86_avx512_psll_w_512: + LogicalShift = true; ShiftLeft = true; IsCountOperandInteger = false; + break; + case Intrinsic::x86_sse2_pslli_d: + case Intrinsic::x86_sse2_pslli_q: + case Intrinsic::x86_sse2_pslli_w: + case Intrinsic::x86_avx2_pslli_d: + case Intrinsic::x86_avx2_pslli_q: + case Intrinsic::x86_avx2_pslli_w: case Intrinsic::x86_avx512_pslli_d_512: case 
Intrinsic::x86_avx512_pslli_q_512: case Intrinsic::x86_avx512_pslli_w_512:
-    LogicalShift = true; ShiftLeft = true;
+    LogicalShift = true; ShiftLeft = true; IsCountOperandInteger = true;
     break;
   }
   assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");
 
   // Simplify if count is constant.
+  auto Vec = II.getArgOperand(0);
   auto Arg1 = II.getArgOperand(1);
+  auto VT = cast<VectorType>(Vec->getType());
+  auto SVT = VT->getElementType();
+  unsigned VWidth = VT->getNumElements();
+  unsigned BitWidth = SVT->getPrimitiveSizeInBits();
+
   auto CAZ = dyn_cast<ConstantAggregateZero>(Arg1);
   auto CDV = dyn_cast<ConstantDataVector>(Arg1);
   auto CInt = dyn_cast<ConstantInt>(Arg1);
-  if (!CAZ && !CDV && !CInt)
-    return nullptr;
+
+  if (!CAZ && !CDV && !CInt) {
+    // Get the count argument.
+    Value *Count;
+    if (!IsCountOperandInteger) {
+      // The number of bits to shift by is stored in the lowest 64 bits
+      // of the Arg1 vector.
+      assert(Arg1->getType()->isVectorTy() &&
+             "Count argument expected to be of a vector type.");
+      // Retrieve the shift value.
+      VectorType *VTy = VectorType::get(Type::getInt64Ty(II.getContext()), 2);
+      Arg1 = Builder.CreateBitCast(Arg1, VTy);
+      Count = Builder.CreateExtractElement(Arg1, (uint64_t)0);
+    } else {
+      // The number of bits to shift by is stored in an unsigned integer.
+      assert(Arg1->getType()->isIntegerTy() &&
+             "Count argument expected to be of an integer type.");
+      Count = Arg1;
+    }
+
+    // In IR, shift amounts greater than or equal to BitWidth produce a
+    // poison value, so clamp the count to BitWidth - 1 for arithmetic
+    // shifts and handle the out-of-range case explicitly for logical shifts.
+    Type *CountTy = Count->getType();
+    Value *MaxConstVal = ConstantInt::get(CountTy, BitWidth - 1);
+
+    if (!LogicalShift) {
+      // Clamp out-of-range shift amounts (an unsigned min).
+      // Shifting by (BitWidth - 1) in an arithmetic shift won't emit a poison value.
+      Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_ULT, Count, MaxConstVal);
+      Count = Builder.CreateSelect(Cmp, Count, MaxConstVal);
+      Count = Builder.CreateZExtOrTrunc(Count, SVT);
+      Value *ShiftVec = Builder.CreateVectorSplat(VWidth, Count);
+      return Builder.CreateAShr(Vec, ShiftVec);
+    } else {
+      // Shifting by BitWidth would emit a poison value in a logical shift.
+      // In cases where Count >= BitWidth, don't do a shift and
+      // insert a zero value instead.
+      Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_ULE, Count, MaxConstVal);
+      Count = Builder.CreateZExtOrTrunc(Count, SVT);
+      Value *ShiftVec = Builder.CreateVectorSplat(VWidth, Count);
+      Value *ZeroVec = ConstantAggregateZero::get(VT);
+      Vec = Builder.CreateSelect(Cmp, Vec, ZeroVec);
+      ShiftVec = Builder.CreateSelect(Cmp, ShiftVec, ZeroVec);
+      return ShiftLeft ? Builder.CreateShl(Vec, ShiftVec) :
+                         Builder.CreateLShr(Vec, ShiftVec);
+    }
+  }
 
   APInt Count(64, 0);
   if (CDV) {
@@ -341,12 +400,6 @@
   else if (CInt)
     Count = CInt->getValue();
 
-  auto Vec = II.getArgOperand(0);
-  auto VT = cast<VectorType>(Vec->getType());
-  auto SVT = VT->getElementType();
-  unsigned VWidth = VT->getNumElements();
-  unsigned BitWidth = SVT->getPrimitiveSizeInBits();
-
   // If shift-by-zero then just return the original value.
   if (Count.isNullValue())
     return Vec;
@@ -423,17 +476,46 @@
   }
   assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");
 
-  // Simplify if all shift amounts are constant/undef.
-  auto *CShift = dyn_cast<Constant>(II.getArgOperand(1));
-  if (!CShift)
-    return nullptr;
-
   auto Vec = II.getArgOperand(0);
+  auto CountVec = II.getArgOperand(1);
   auto VT = cast<VectorType>(II.getType());
   auto SVT = VT->getVectorElementType();
   int NumElts = VT->getNumElements();
   int BitWidth = SVT->getIntegerBitWidth();
+  auto *CShift = dyn_cast<Constant>(II.getArgOperand(1));
+
+  // Lower to generic IR shifts if the shift amounts are not constant/undef.
+  if (!CShift) {
+    auto VecType = dyn_cast<VectorType>(Vec->getType());
+    auto CountVecType = dyn_cast<VectorType>(CountVec->getType());
+    if (VecType != CountVecType)
+      return nullptr;
+    // Create a splat vector of the maximum in-range shift amount (BitWidth - 1).
+    auto ConstVal = ConstantInt::get(SVT, BitWidth - 1);
+    auto ConstantVecMax = Builder.CreateVectorSplat(NumElts, ConstVal);
+
+    if (!LogicalShift) {
+      // Clamp out-of-range shift amounts (an unsigned min).
+      Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_ULT,
+                                      CountVec, ConstantVecMax);
+      CountVec = Builder.CreateSelect(Cmp, CountVec, ConstantVecMax);
+      return Builder.CreateAShr(Vec, CountVec);
+    } else {
+      // Shifting an element by BitWidth or more would emit
+      // a poison value in LLVM IR.
+      // In that case, a zero value is inserted and shifted by 0 bits.
+      Value *ZeroVec = ConstantAggregateZero::get(VecType);
+      Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_ULE,
+                                      CountVec, ConstantVecMax);
+      Vec = Builder.CreateSelect(Cmp, Vec, ZeroVec);
+      CountVec = Builder.CreateSelect(Cmp, CountVec, ZeroVec);
+      return ShiftLeft ? Builder.CreateShl(Vec, CountVec) :
+                         Builder.CreateLShr(Vec, CountVec);
+    }
+  }
+
+  // Simplify if all shift amounts are constant/undef.
   // Collect each element's shift amount.
   // We also collect special cases: UNDEF = -1, OUT-OF-RANGE = BitWidth.
   bool AnyOutOfRange = false;
Index: test/CodeGen/X86/combine-shl.ll
===================================================================
--- test/CodeGen/X86/combine-shl.ll
+++ test/CodeGen/X86/combine-shl.ll
@@ -2,6 +2,7 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX-SLOW
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX-FAST
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512
 
 ; fold (shl 0, x) -> 0
 define <4 x i32> @combine_vec_shl_zero(<4 x i32> %x) {
@@ -597,3 +598,290 @@
   %2 = shl <4 x i32> %1, 
   ret <4 x i32> %2
 }
+
+; fold (shl (select (setcc y, c, ult), xvec, zerovec),
+;           (select (setcc y, c, ult), yvec, zerovec))
+; -> (shl xvec, (umin yvec, maxvec))
+define <2 x i64> @combine_vec_shl_min128(<2 x i64> %x, <2 x i64> %y) {
+; SSE-LABEL: combine_vec_shl_min128:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pslld $23, %xmm1
+; SSE-NEXT:    paddd {{.*}}(%rip), %xmm1
+; SSE-NEXT:    cvttps2dq %xmm1, %xmm1
+; SSE-NEXT:    pmulld %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_vec_shl_min128:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: combine_vec_shl_min128:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+  %1 = bitcast <2 x i64> %x to <4 x i32>
+  %2 = bitcast <2 x i64> %y to <4 x i32>
+  %3 = icmp ult <4 x i32> %2, 
+  %4 = select <4 x i1> %3, <4 x i32> %1, <4 x i32> zeroinitializer
+  %5 = select <4 x i1> %3, <4 x i32> %2, <4 x i32> zeroinitializer
+  %6 = shl <4 x i32> %4, %5
+  %7 = bitcast <4 x i32> %6 to <2 x i64>
+  ret <2 x i64> %7
+}
+
+define <4 x i64> @combine_vec_shl_min256(<4 x i64> %x, <4 x i64> %y) { +; SSE-LABEL: combine_vec_shl_min256: +; SSE: # %bb.0: +; SSE-NEXT: pslld $23, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216] +; SSE-NEXT: paddd %xmm4, %xmm2 +; SSE-NEXT: cvttps2dq %xmm2, %xmm2 +; SSE-NEXT: pmulld %xmm2, %xmm0 +; SSE-NEXT: pslld $23, %xmm3 +; SSE-NEXT: paddd %xmm4, %xmm3 +; SSE-NEXT: cvttps2dq %xmm3, %xmm2 +; SSE-NEXT: pmulld %xmm2, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_vec_shl_min256: +; AVX: # %bb.0: +; AVX-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 +; AVX-NEXT: retq +; +; AVX512-LABEL: combine_vec_shl_min256: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: retq + %1 = bitcast <4 x i64> %x to <8 x i32> + %2 = bitcast <4 x i64> %y to <8 x i32> + %3 = icmp ult <8 x i32> %2, + %4 = select <8 x i1> %3, <8 x i32> %1, <8 x i32> zeroinitializer + %5 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer + %6 = shl <8 x i32> %4, %5 + %7 = bitcast <8 x i32> %6 to <4 x i64> + ret <4 x i64> %7 +} + +define <8 x i64> @combine_vec_shl_min512(<8 x i64> %x, <8 x i64> %y) { +; SSE-LABEL: combine_vec_shl_min512: +; SSE: # %bb.0: +; SSE-NEXT: pslld $23, %xmm4 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [1065353216,1065353216,1065353216,1065353216] +; SSE-NEXT: paddd %xmm8, %xmm4 +; SSE-NEXT: cvttps2dq %xmm4, %xmm4 +; SSE-NEXT: pmulld %xmm4, %xmm0 +; SSE-NEXT: pslld $23, %xmm5 +; SSE-NEXT: paddd %xmm8, %xmm5 +; SSE-NEXT: cvttps2dq %xmm5, %xmm4 +; SSE-NEXT: pmulld %xmm4, %xmm1 +; SSE-NEXT: pslld $23, %xmm6 +; SSE-NEXT: paddd %xmm8, %xmm6 +; SSE-NEXT: cvttps2dq %xmm6, %xmm4 +; SSE-NEXT: pmulld %xmm4, %xmm2 +; SSE-NEXT: pslld $23, %xmm7 +; SSE-NEXT: paddd %xmm8, %xmm7 +; SSE-NEXT: cvttps2dq %xmm7, %xmm4 +; SSE-NEXT: pmulld %xmm4, %xmm3 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_vec_shl_min512: +; AVX: # %bb.0: +; AVX-NEXT: vpsllvd %ymm2, %ymm0, %ymm0 +; AVX-NEXT: vpsllvd %ymm3, %ymm1, %ymm1 +; AVX-NEXT: retq +; +; AVX512-LABEL: combine_vec_shl_min512: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %1 = bitcast <8 x i64> %x to <16 x i32> + %2 = bitcast <8 x i64> %y to <16 x i32> + %3 = icmp ult <16 x i32> %2, + %4 = select <16 x i1> %3, <16 x i32> %1, <16 x i32> zeroinitializer + %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer + %6 = shl <16 x i32> %4, %5 + %7 = bitcast <16 x i32> %6 to <8 x i64> + ret <8 x i64> %7 +} + +define <2 x i64> @combine_scalar_shl_i_128(<2 x i64> %x, i32 %y) { +; SSE-LABEL: combine_scalar_shl_i_128: +; SSE: # %bb.0: +; SSE-NEXT: movd %edi, %xmm1 +; SSE-NEXT: pslld %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_scalar_shl_i_128: +; AVX: # %bb.0: +; AVX-NEXT: vmovd %edi, %xmm1 +; AVX-NEXT: vpslld %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; AVX512-LABEL: combine_scalar_shl_i_128: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovd %edi, %xmm1 +; AVX512-NEXT: vpslld %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq + %1 = bitcast <2 x i64> %x to <4 x i32> + %2 = icmp ult i32 %y, 32 + %3 = insertelement <4 x i32> undef, i32 %y, i32 0 + %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> zeroinitializer + %5 = select i1 %2, <4 x i32> %1, <4 x i32> zeroinitializer + %6 = select i1 %2, <4 x i32> %4, <4 x i32> zeroinitializer + %7 = shl <4 x i32> %5, %6 + %8 = bitcast <4 x i32> %7 to <2 x i64> + ret <2 x i64> %8 +} + +define <4 x i64> @combine_scalar_shl_i_256(<4 x i64> %x, i32 %y) { +; SSE-LABEL: combine_scalar_shl_i_256: +; SSE: # %bb.0: +; SSE-NEXT: movd %edi, %xmm2 +; 
SSE-NEXT: pslld %xmm2, %xmm0 +; SSE-NEXT: pslld %xmm2, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_scalar_shl_i_256: +; AVX: # %bb.0: +; AVX-NEXT: vmovd %edi, %xmm1 +; AVX-NEXT: vpslld %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq +; +; AVX512-LABEL: combine_scalar_shl_i_256: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovd %edi, %xmm1 +; AVX512-NEXT: vpslld %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: retq + %1 = bitcast <4 x i64> %x to <8 x i32> + %2 = icmp ult i32 %y, 32 + %3 = insertelement <8 x i32> undef, i32 %y, i32 0 + %4 = shufflevector <8 x i32> %3, <8 x i32> undef, <8 x i32> zeroinitializer + %5 = select i1 %2, <8 x i32> %1, <8 x i32> zeroinitializer + %6 = select i1 %2, <8 x i32> %4, <8 x i32> zeroinitializer + %7 = shl <8 x i32> %5, %6 + %8 = bitcast <8 x i32> %7 to <4 x i64> + ret <4 x i64> %8 +} + +define <8 x i64> @combine_scalar_shl_i_512(<8 x i64> %x, i32 %y) { +; SSE-LABEL: combine_scalar_shl_i_512: +; SSE: # %bb.0: +; SSE-NEXT: movd %edi, %xmm4 +; SSE-NEXT: pslld %xmm4, %xmm0 +; SSE-NEXT: pslld %xmm4, %xmm1 +; SSE-NEXT: pslld %xmm4, %xmm2 +; SSE-NEXT: pslld %xmm4, %xmm3 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_scalar_shl_i_512: +; AVX: # %bb.0: +; AVX-NEXT: vmovd %edi, %xmm2 +; AVX-NEXT: vpslld %xmm2, %ymm0, %ymm0 +; AVX-NEXT: vpslld %xmm2, %ymm1, %ymm1 +; AVX-NEXT: retq +; +; AVX512-LABEL: combine_scalar_shl_i_512: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovd %edi, %xmm1 +; AVX512-NEXT: vpslld %xmm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %1 = bitcast <8 x i64> %x to <16 x i32> + %2 = icmp ult i32 %y, 32 + %3 = insertelement <16 x i32> undef, i32 %y, i32 0 + %4 = shufflevector <16 x i32> %3, <16 x i32> undef, <16 x i32> zeroinitializer + %5 = select i1 %2, <16 x i32> %1, <16 x i32> zeroinitializer + %6 = select i1 %2, <16 x i32> %4, <16 x i32> zeroinitializer + %7 = shl <16 x i32> %5, %6 + %8 = bitcast <16 x i32> %7 to <8 x i64> + ret <8 x i64> %8 +} + +define <2 x i64> @combine_scalar_shl_128(<2 x i64> %x, <2 x i64> %y) { +; SSE-LABEL: combine_scalar_shl_128: +; SSE: # %bb.0: +; SSE-NEXT: pslld %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_scalar_shl_128: +; AVX: # %bb.0: +; AVX-NEXT: vpslld %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; AVX512-LABEL: combine_scalar_shl_128: +; AVX512: # %bb.0: +; AVX512-NEXT: vpslld %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq + %1 = bitcast <2 x i64> %x to <4 x i32> + %2 = extractelement <2 x i64> %y, i64 0 + %3 = icmp ult i64 %2, 32 + %4 = trunc i64 %2 to i32 + %5 = insertelement <4 x i32> undef, i32 %4, i32 0 + %6 = shufflevector <4 x i32> %5, <4 x i32> undef, <4 x i32> zeroinitializer + %7 = select i1 %3, <4 x i32> %1, <4 x i32> zeroinitializer + %8 = select i1 %3, <4 x i32> %6, <4 x i32> zeroinitializer + %9 = shl <4 x i32> %7, %8 + %10 = bitcast <4 x i32> %9 to <2 x i64> + ret <2 x i64> %10 +} + +define <4 x i64> @combine_scalar_shl_256(<4 x i64> %x, <4 x i64> %y) { +; SSE-LABEL: combine_scalar_shl_256: +; SSE: # %bb.0: +; SSE-NEXT: pslld %xmm2, %xmm0 +; SSE-NEXT: pslld %xmm2, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_scalar_shl_256: +; AVX: # %bb.0: +; AVX-NEXT: vpslld %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq +; +; AVX512-LABEL: combine_scalar_shl_256: +; AVX512: # %bb.0: +; AVX512-NEXT: vpslld %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: retq + %1 = bitcast <4 x i64> %x to <8 x i32> + %2 = extractelement <4 x i64> %y, i64 0 + %3 = icmp ult i64 %2, 32 + %4 = trunc i64 %2 to i32 + %5 = insertelement <8 x i32> undef, i32 %4, i32 0 + %6 = shufflevector <8 x i32> %5, <8 x i32> undef, <8 x i32> zeroinitializer + %7 = select i1 %3, <8 x i32> %1, <8 x i32> 
zeroinitializer + %8 = select i1 %3, <8 x i32> %6, <8 x i32> zeroinitializer + %9 = shl <8 x i32> %7, %8 + %10 = bitcast <8 x i32> %9 to <4 x i64> + ret <4 x i64> %10 +} + +define <8 x i64> @combine_scalar_shl_512(<8 x i64> %x, <8 x i64> %y) { +; SSE-LABEL: combine_scalar_shl_512: +; SSE: # %bb.0: +; SSE-NEXT: pslld %xmm4, %xmm0 +; SSE-NEXT: pslld %xmm4, %xmm1 +; SSE-NEXT: pslld %xmm4, %xmm2 +; SSE-NEXT: pslld %xmm4, %xmm3 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_scalar_shl_512: +; AVX: # %bb.0: +; AVX-NEXT: vpslld %xmm2, %ymm0, %ymm0 +; AVX-NEXT: vpslld %xmm2, %ymm1, %ymm1 +; AVX-NEXT: retq +; +; AVX512-LABEL: combine_scalar_shl_512: +; AVX512: # %bb.0: +; AVX512-NEXT: vpslld %xmm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %1 = bitcast <8 x i64> %x to <16 x i32> + %2 = extractelement <8 x i64> %y, i64 0 + %3 = icmp ult i64 %2, 32 + %4 = trunc i64 %2 to i32 + %5 = insertelement <16 x i32> undef, i32 %4, i32 0 + %6 = shufflevector <16 x i32> %5, <16 x i32> undef, <16 x i32> zeroinitializer + %7 = select i1 %3, <16 x i32> %1, <16 x i32> zeroinitializer + %8 = select i1 %3, <16 x i32> %6, <16 x i32> zeroinitializer + %9 = shl <16 x i32> %7, %8 + %10 = bitcast <16 x i32> %9 to <8 x i64> + ret <8 x i64> %10 +} Index: test/CodeGen/X86/combine-sra.ll =================================================================== --- test/CodeGen/X86/combine-sra.ll +++ test/CodeGen/X86/combine-sra.ll @@ -2,6 +2,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2-SLOW ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2-FAST +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512 ; fold (sra 0, x) -> 0 define <4 x i32> @combine_vec_ashr_zero(<4 x i32> %x) { @@ -270,6 +271,368 @@ ret <4 x i32> %3 } +; fold (sra x, (min y, c)) -> (sra x, y) +; if c is vector of constants equal (bitsize - 1) of y elements +define <2 x i64> @combine_vec_ashr_min128(<2 x i64> %x, <2 x i64> %y) { +; SSE-LABEL: combine_vec_ashr_min128: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: psrad %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: psrlq $32, %xmm2 +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: psrad %xmm2, %xmm4 +; SSE-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7] +; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: psrad %xmm1, %xmm2 +; SSE-NEXT: psrad %xmm3, %xmm0 +; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] +; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_vec_ashr_min128: +; AVX: # %bb.0: +; AVX-NEXT: vpsravd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; AVX512-LABEL: combine_vec_ashr_min128: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsravd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq + %1 = bitcast <2 x i64> %x to <4 x i32> + %2 = bitcast <2 x i64> %y to <4 x i32> + %3 = icmp ult <4 x i32> %2, + %4 = select <4 x i1> %3, <4 x i32> %2, <4 x i32> + %5 = ashr <4 x i32> %1, %4 + %6 = bitcast <4 x i32> %5 to <2 x i64> + 
ret <2 x i64> %6 +} + +define <4 x i64> @combine_vec_ashr_min256(<4 x i64> %x, <4 x i64> %y) { +; SSE-LABEL: combine_vec_ashr_min256: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: psrad %xmm4, %xmm5 +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: psrlq $32, %xmm4 +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: psrad %xmm4, %xmm6 +; SSE-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7] +; SSE-NEXT: pxor %xmm4, %xmm4 +; SSE-NEXT: pmovzxdq {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE-NEXT: movdqa %xmm0, %xmm7 +; SSE-NEXT: psrad %xmm2, %xmm7 +; SSE-NEXT: psrad %xmm5, %xmm0 +; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm7[4,5,6,7] +; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3],xmm0[4,5],xmm6[6,7] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: movdqa %xmm1, %xmm5 +; SSE-NEXT: psrad %xmm2, %xmm5 +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: psrlq $32, %xmm2 +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: psrad %xmm2, %xmm6 +; SSE-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7] +; SSE-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: psrad %xmm3, %xmm4 +; SSE-NEXT: psrad %xmm2, %xmm1 +; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7] +; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3],xmm1[4,5],xmm6[6,7] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_vec_ashr_min256: +; AVX: # %bb.0: +; AVX-NEXT: vpsravd %ymm1, %ymm0, %ymm0 +; AVX-NEXT: retq +; +; AVX512-LABEL: combine_vec_ashr_min256: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsravd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: retq + %1 = bitcast <4 x i64> %x to <8 x i32> + %2 = bitcast <4 x i64> %y to <8 x i32> + %3 = icmp ult <8 x i32> %2, + %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> + %5 = ashr <8 x i32> %1, %4 + %6 = bitcast <8 x i32> %5 to <4 x i64> + ret <4 x i64> %6 +} + +define <8 x i64> @combine_vec_ashr_min512(<8 x i64> %x, <8 x i64> %y) { +; SSE-LABEL: combine_vec_ashr_min512: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm4, %xmm8 +; SSE-NEXT: psrldq {{.*#+}} xmm8 = xmm8[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: movdqa %xmm0, %xmm9 +; SSE-NEXT: psrad %xmm8, %xmm9 +; SSE-NEXT: movdqa %xmm4, %xmm8 +; SSE-NEXT: psrlq $32, %xmm8 +; SSE-NEXT: movdqa %xmm0, %xmm10 +; SSE-NEXT: psrad %xmm8, %xmm10 +; SSE-NEXT: pblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm9[4,5,6,7] +; SSE-NEXT: pxor %xmm8, %xmm8 +; SSE-NEXT: pmovzxdq {{.*#+}} xmm9 = xmm4[0],zero,xmm4[1],zero +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm8[2],xmm4[3],xmm8[3] +; SSE-NEXT: movdqa %xmm0, %xmm11 +; SSE-NEXT: psrad %xmm4, %xmm11 +; SSE-NEXT: psrad %xmm9, %xmm0 +; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm11[4,5,6,7] +; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm10[2,3],xmm0[4,5],xmm10[6,7] +; SSE-NEXT: movdqa %xmm5, %xmm4 +; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: movdqa %xmm1, %xmm9 +; SSE-NEXT: psrad %xmm4, %xmm9 +; SSE-NEXT: movdqa %xmm5, %xmm10 +; SSE-NEXT: psrlq $32, %xmm10 +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: psrad %xmm10, %xmm4 
+; SSE-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm9[4,5,6,7] +; SSE-NEXT: pmovzxdq {{.*#+}} xmm9 = xmm5[0],zero,xmm5[1],zero +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm8[2],xmm5[3],xmm8[3] +; SSE-NEXT: movdqa %xmm1, %xmm10 +; SSE-NEXT: psrad %xmm5, %xmm10 +; SSE-NEXT: psrad %xmm9, %xmm1 +; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm10[4,5,6,7] +; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7] +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: movdqa %xmm2, %xmm9 +; SSE-NEXT: psrad %xmm4, %xmm9 +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: psrlq $32, %xmm4 +; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: psrad %xmm4, %xmm5 +; SSE-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm9[4,5,6,7] +; SSE-NEXT: pmovzxdq {{.*#+}} xmm9 = xmm6[0],zero,xmm6[1],zero +; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm8[2],xmm6[3],xmm8[3] +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: psrad %xmm6, %xmm4 +; SSE-NEXT: psrad %xmm9, %xmm2 +; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7] +; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5],xmm5[6,7] +; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: movdqa %xmm3, %xmm5 +; SSE-NEXT: psrad %xmm4, %xmm5 +; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: psrlq $32, %xmm4 +; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: psrad %xmm4, %xmm6 +; SSE-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7] +; SSE-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm7[0],zero,xmm7[1],zero +; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; SSE-NEXT: movdqa %xmm3, %xmm5 +; SSE-NEXT: psrad %xmm7, %xmm5 +; SSE-NEXT: psrad %xmm4, %xmm3 +; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm5[4,5,6,7] +; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm6[2,3],xmm3[4,5],xmm6[6,7] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_vec_ashr_min512: +; AVX: # %bb.0: +; AVX-NEXT: vpsravd %ymm2, %ymm0, %ymm0 +; AVX-NEXT: vpsravd %ymm3, %ymm1, %ymm1 +; AVX-NEXT: retq +; +; AVX512-LABEL: combine_vec_ashr_min512: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsravd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %1 = bitcast <8 x i64> %x to <16 x i32> + %2 = bitcast <8 x i64> %y to <16 x i32> + %3 = icmp ult <16 x i32> %2, + %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> + %5 = ashr <16 x i32> %1, %4 + %6 = bitcast <16 x i32> %5 to <8 x i64> + ret <8 x i64> %6 +} + +define <2 x i64> @combine_scalar_ashr_i_128(<2 x i64> %x, i32 %y) { +; SSE-LABEL: combine_scalar_ashr_i_128: +; SSE: # %bb.0: +; SSE-NEXT: movd %edi, %xmm1 +; SSE-NEXT: psrad %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_scalar_ashr_i_128: +; AVX: # %bb.0: +; AVX-NEXT: vmovd %edi, %xmm1 +; AVX-NEXT: vpsrad %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; AVX512-LABEL: combine_scalar_ashr_i_128: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovd %edi, %xmm1 +; AVX512-NEXT: vpsrad %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq + %1 = bitcast <2 x i64> %x to <4 x i32> + %2 = icmp ult i32 %y, 31 + %3 = select i1 %2, i32 %y, i32 31 + %4 = insertelement <4 x i32> undef, i32 %3, i32 0 + %5 = shufflevector <4 x i32> %4, <4 x i32> undef, <4 x i32> zeroinitializer + %6 = ashr <4 x i32> %1, %5 + %7 = bitcast <4 x i32> %6 to <2 x i64> + ret <2 x i64> %7 +} + +define <4 x i64> @combine_scalar_ashr_i_256(<4 x i64> %x, i32 %y) { +; SSE-LABEL: combine_scalar_ashr_i_256: +; SSE: # %bb.0: +; 
SSE-NEXT: movd %edi, %xmm2 +; SSE-NEXT: psrad %xmm2, %xmm0 +; SSE-NEXT: psrad %xmm2, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_scalar_ashr_i_256: +; AVX: # %bb.0: +; AVX-NEXT: vmovd %edi, %xmm1 +; AVX-NEXT: vpsrad %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq +; +; AVX512-LABEL: combine_scalar_ashr_i_256: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovd %edi, %xmm1 +; AVX512-NEXT: vpsrad %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: retq + %1 = bitcast <4 x i64> %x to <8 x i32> + %2 = icmp ult i32 %y, 31 + %3 = select i1 %2, i32 %y, i32 31 + %4 = insertelement <8 x i32> undef, i32 %3, i32 0 + %5 = shufflevector <8 x i32> %4, <8 x i32> undef, <8 x i32> zeroinitializer + %6 = ashr <8 x i32> %1, %5 + %7 = bitcast <8 x i32> %6 to <4 x i64> + ret <4 x i64> %7 +} + +define <8 x i64> @combine_scalar_ashr_i_512(<8 x i64> %x, i32 %y) { +; SSE-LABEL: combine_scalar_ashr_i_512: +; SSE: # %bb.0: +; SSE-NEXT: movd %edi, %xmm4 +; SSE-NEXT: psrad %xmm4, %xmm0 +; SSE-NEXT: psrad %xmm4, %xmm1 +; SSE-NEXT: psrad %xmm4, %xmm2 +; SSE-NEXT: psrad %xmm4, %xmm3 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_scalar_ashr_i_512: +; AVX: # %bb.0: +; AVX-NEXT: vmovd %edi, %xmm2 +; AVX-NEXT: vpsrad %xmm2, %ymm0, %ymm0 +; AVX-NEXT: vpsrad %xmm2, %ymm1, %ymm1 +; AVX-NEXT: retq +; +; AVX512-LABEL: combine_scalar_ashr_i_512: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovd %edi, %xmm1 +; AVX512-NEXT: vpsrad %xmm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %1 = bitcast <8 x i64> %x to <16 x i32> + %2 = icmp ult i32 %y, 31 + %3 = select i1 %2, i32 %y, i32 31 + %4 = insertelement <16 x i32> undef, i32 %3, i32 0 + %5 = shufflevector <16 x i32> %4, <16 x i32> undef, <16 x i32> zeroinitializer + %6 = ashr <16 x i32> %1, %5 + %7 = bitcast <16 x i32> %6 to <8 x i64> + ret <8 x i64> %7 +} + +define <2 x i64> @combine_scalar_ashr_128(<2 x i64> %x, <2 x i64> %y) { +; SSE-LABEL: combine_scalar_ashr_128: +; SSE: # %bb.0: +; SSE-NEXT: psrad %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_scalar_ashr_128: +; AVX: # %bb.0: +; AVX-NEXT: vpsrad %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; AVX512-LABEL: combine_scalar_ashr_128: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrad %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq + %1 = bitcast <2 x i64> %x to <4 x i32> + %2 = extractelement <2 x i64> %y, i64 0 + %3 = icmp ult i64 %2, 31 + %4 = select i1 %3, i64 %2, i64 31 + %5 = trunc i64 %4 to i32 + %6 = insertelement <4 x i32> undef, i32 %5, i32 0 + %7 = shufflevector <4 x i32> %6, <4 x i32> undef, <4 x i32> zeroinitializer + %8 = ashr <4 x i32> %1, %7 + %9 = bitcast <4 x i32> %8 to <2 x i64> + ret <2 x i64> %9 +} + +define <4 x i64> @combine_scalar_ashr_256(<4 x i64> %x, <4 x i64> %y) { +; SSE-LABEL: combine_scalar_ashr_256: +; SSE: # %bb.0: +; SSE-NEXT: psrad %xmm2, %xmm0 +; SSE-NEXT: psrad %xmm2, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_scalar_ashr_256: +; AVX: # %bb.0: +; AVX-NEXT: vpsrad %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq +; +; AVX512-LABEL: combine_scalar_ashr_256: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrad %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: retq + %1 = bitcast <4 x i64> %x to <8 x i32> + %2 = extractelement <4 x i64> %y, i64 0 + %3 = icmp ult i64 %2, 31 + %4 = select i1 %3, i64 %2, i64 31 + %5 = trunc i64 %4 to i32 + %6 = insertelement <8 x i32> undef, i32 %5, i32 0 + %7 = shufflevector <8 x i32> %6, <8 x i32> undef, <8 x i32> zeroinitializer + %8 = ashr <8 x i32> %1, %7 + %9 = bitcast <8 x i32> %8 to <4 x i64> + ret <4 x i64> %9 +} + +define <8 x i64> @combine_scalar_ashr_512(<8 x i64> %x, <8 x i64> %y) { +; SSE-LABEL: combine_scalar_ashr_512: +; SSE: # %bb.0: +; 
SSE-NEXT: psrad %xmm4, %xmm0 +; SSE-NEXT: psrad %xmm4, %xmm1 +; SSE-NEXT: psrad %xmm4, %xmm2 +; SSE-NEXT: psrad %xmm4, %xmm3 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_scalar_ashr_512: +; AVX: # %bb.0: +; AVX-NEXT: vpsrad %xmm2, %ymm0, %ymm0 +; AVX-NEXT: vpsrad %xmm2, %ymm1, %ymm1 +; AVX-NEXT: retq +; +; AVX512-LABEL: combine_scalar_ashr_512: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrad %xmm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %1 = bitcast <8 x i64> %x to <16 x i32> + %2 = extractelement <8 x i64> %y, i64 0 + %3 = icmp ult i64 %2, 31 + %4 = select i1 %3, i64 %2, i64 31 + %5 = trunc i64 %4 to i32 + %6 = insertelement <16 x i32> undef, i32 %5, i32 0 + %7 = shufflevector <16 x i32> %6, <16 x i32> undef, <16 x i32> zeroinitializer + %8 = ashr <16 x i32> %1, %7 + %9 = bitcast <16 x i32> %8 to <8 x i64> + ret <8 x i64> %9 +} + ; If the sign bit is known to be zero, switch this to a SRL. define <4 x i32> @combine_vec_ashr_positive(<4 x i32> %x, <4 x i32> %y) { ; SSE-LABEL: combine_vec_ashr_positive: Index: test/CodeGen/X86/combine-srl.ll =================================================================== --- test/CodeGen/X86/combine-srl.ll +++ test/CodeGen/X86/combine-srl.ll @@ -2,6 +2,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2-SLOW ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2-FAST +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512 ; fold (srl 0, x) -> 0 define <4 x i32> @combine_vec_lshr_zero(<4 x i32> %x) { @@ -466,3 +467,375 @@ %3 = lshr <4 x i32> %x, %2 ret <4 x i32> %3 } + +; fold (srl (select (setcc y, c, lt), xvec, zerovec), +; (select (setcc y, c, lt), yvec, zerovec)) +; -> (srl xvec, (smin yvec, maxvec)) +define <2 x i64> @combine_vec_srl_min128(<2 x i64> %x, <2 x i64> %y) { +; SSE-LABEL: combine_vec_srl_min128: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: psrld %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: psrlq $32, %xmm2 +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: psrld %xmm2, %xmm4 +; SSE-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7] +; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: psrld %xmm1, %xmm2 +; SSE-NEXT: psrld %xmm3, %xmm0 +; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] +; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_vec_srl_min128: +; AVX: # %bb.0: +; AVX-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; AVX512-LABEL: combine_vec_srl_min128: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq + %1 = bitcast <2 x i64> %x to <4 x i32> + %2 = bitcast <2 x i64> %y to <4 x i32> + %3 = icmp ult <4 x i32> %2, + %4 = select <4 x i1> %3, <4 x i32> %1, <4 x i32> zeroinitializer + %5 = select <4 x i1> %3, <4 x i32> %2, <4 x i32> zeroinitializer + %6 = lshr <4 x i32> %4, %5 + %7 = bitcast <4 x i32> %6 to <2 x i64> + ret <2 x i64> %7 +} + +define <4 x i64> @combine_vec_srl_min256(<4 x i64> 
%x, <4 x i64> %y) { +; SSE-LABEL: combine_vec_srl_min256: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: psrld %xmm4, %xmm5 +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: psrlq $32, %xmm4 +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: psrld %xmm4, %xmm6 +; SSE-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7] +; SSE-NEXT: pxor %xmm4, %xmm4 +; SSE-NEXT: pmovzxdq {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE-NEXT: movdqa %xmm0, %xmm7 +; SSE-NEXT: psrld %xmm2, %xmm7 +; SSE-NEXT: psrld %xmm5, %xmm0 +; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm7[4,5,6,7] +; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3],xmm0[4,5],xmm6[6,7] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: movdqa %xmm1, %xmm5 +; SSE-NEXT: psrld %xmm2, %xmm5 +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: psrlq $32, %xmm2 +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: psrld %xmm2, %xmm6 +; SSE-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7] +; SSE-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: psrld %xmm3, %xmm4 +; SSE-NEXT: psrld %xmm2, %xmm1 +; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7] +; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3],xmm1[4,5],xmm6[6,7] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_vec_srl_min256: +; AVX: # %bb.0: +; AVX-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 +; AVX-NEXT: retq +; +; AVX512-LABEL: combine_vec_srl_min256: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: retq + %1 = bitcast <4 x i64> %x to <8 x i32> + %2 = bitcast <4 x i64> %y to <8 x i32> + %3 = icmp ult <8 x i32> %2, + %4 = select <8 x i1> %3, <8 x i32> %1, <8 x i32> zeroinitializer + %5 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer + %6 = lshr <8 x i32> %4, %5 + %7 = bitcast <8 x i32> %6 to <4 x i64> + ret <4 x i64> %7 +} + +define <8 x i64> @combine_vec_srl_min512(<8 x i64> %x, <8 x i64> %y) { +; SSE-LABEL: combine_vec_srl_min512: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm4, %xmm8 +; SSE-NEXT: psrldq {{.*#+}} xmm8 = xmm8[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: movdqa %xmm0, %xmm9 +; SSE-NEXT: psrld %xmm8, %xmm9 +; SSE-NEXT: movdqa %xmm4, %xmm8 +; SSE-NEXT: psrlq $32, %xmm8 +; SSE-NEXT: movdqa %xmm0, %xmm10 +; SSE-NEXT: psrld %xmm8, %xmm10 +; SSE-NEXT: pblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm9[4,5,6,7] +; SSE-NEXT: pxor %xmm8, %xmm8 +; SSE-NEXT: pmovzxdq {{.*#+}} xmm9 = xmm4[0],zero,xmm4[1],zero +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm8[2],xmm4[3],xmm8[3] +; SSE-NEXT: movdqa %xmm0, %xmm11 +; SSE-NEXT: psrld %xmm4, %xmm11 +; SSE-NEXT: psrld %xmm9, %xmm0 +; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm11[4,5,6,7] +; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm10[2,3],xmm0[4,5],xmm10[6,7] +; SSE-NEXT: movdqa %xmm5, %xmm4 +; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: movdqa %xmm1, %xmm9 +; SSE-NEXT: psrld %xmm4, %xmm9 +; SSE-NEXT: movdqa %xmm5, %xmm10 +; SSE-NEXT: psrlq $32, %xmm10 +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: psrld %xmm10, 
%xmm4 +; SSE-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm9[4,5,6,7] +; SSE-NEXT: pmovzxdq {{.*#+}} xmm9 = xmm5[0],zero,xmm5[1],zero +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm8[2],xmm5[3],xmm8[3] +; SSE-NEXT: movdqa %xmm1, %xmm10 +; SSE-NEXT: psrld %xmm5, %xmm10 +; SSE-NEXT: psrld %xmm9, %xmm1 +; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm10[4,5,6,7] +; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7] +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: movdqa %xmm2, %xmm9 +; SSE-NEXT: psrld %xmm4, %xmm9 +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: psrlq $32, %xmm4 +; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: psrld %xmm4, %xmm5 +; SSE-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm9[4,5,6,7] +; SSE-NEXT: pmovzxdq {{.*#+}} xmm9 = xmm6[0],zero,xmm6[1],zero +; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm8[2],xmm6[3],xmm8[3] +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: psrld %xmm6, %xmm4 +; SSE-NEXT: psrld %xmm9, %xmm2 +; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7] +; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5],xmm5[6,7] +; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: movdqa %xmm3, %xmm5 +; SSE-NEXT: psrld %xmm4, %xmm5 +; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: psrlq $32, %xmm4 +; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: psrld %xmm4, %xmm6 +; SSE-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7] +; SSE-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm7[0],zero,xmm7[1],zero +; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; SSE-NEXT: movdqa %xmm3, %xmm5 +; SSE-NEXT: psrld %xmm7, %xmm5 +; SSE-NEXT: psrld %xmm4, %xmm3 +; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm5[4,5,6,7] +; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm6[2,3],xmm3[4,5],xmm6[6,7] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_vec_srl_min512: +; AVX: # %bb.0: +; AVX-NEXT: vpsrlvd %ymm2, %ymm0, %ymm0 +; AVX-NEXT: vpsrlvd %ymm3, %ymm1, %ymm1 +; AVX-NEXT: retq +; +; AVX512-LABEL: combine_vec_srl_min512: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %1 = bitcast <8 x i64> %x to <16 x i32> + %2 = bitcast <8 x i64> %y to <16 x i32> + %3 = icmp ult <16 x i32> %2, + %4 = select <16 x i1> %3, <16 x i32> %1, <16 x i32> zeroinitializer + %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer + %6 = lshr <16 x i32> %4, %5 + %7 = bitcast <16 x i32> %6 to <8 x i64> + ret <8 x i64> %7 +} + +define <2 x i64> @combine_scalar_srl_i_128(<2 x i64> %x, i32 %y) { +; SSE-LABEL: combine_scalar_srl_i_128: +; SSE: # %bb.0: +; SSE-NEXT: movd %edi, %xmm1 +; SSE-NEXT: psrld %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_scalar_srl_i_128: +; AVX: # %bb.0: +; AVX-NEXT: vmovd %edi, %xmm1 +; AVX-NEXT: vpsrld %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; AVX512-LABEL: combine_scalar_srl_i_128: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovd %edi, %xmm1 +; AVX512-NEXT: vpsrld %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq + %1 = bitcast <2 x i64> %x to <4 x i32> + %2 = icmp ult i32 %y, 32 + %3 = insertelement <4 x i32> undef, i32 %y, i32 0 + %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> zeroinitializer + %5 = select i1 %2, <4 x i32> %1, <4 x i32> zeroinitializer + %6 = select i1 %2, <4 x i32> %4, <4 x i32> zeroinitializer + %7 = lshr <4 x i32> %5, %6 + %8 = bitcast <4 x i32> 
%7 to <2 x i64> + ret <2 x i64> %8 +} + +define <4 x i64> @combine_scalar_srl_i_256(<4 x i64> %x, i32 %y) { +; SSE-LABEL: combine_scalar_srl_i_256: +; SSE: # %bb.0: +; SSE-NEXT: movd %edi, %xmm2 +; SSE-NEXT: psrld %xmm2, %xmm0 +; SSE-NEXT: psrld %xmm2, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_scalar_srl_i_256: +; AVX: # %bb.0: +; AVX-NEXT: vmovd %edi, %xmm1 +; AVX-NEXT: vpsrld %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq +; +; AVX512-LABEL: combine_scalar_srl_i_256: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovd %edi, %xmm1 +; AVX512-NEXT: vpsrld %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: retq + %1 = bitcast <4 x i64> %x to <8 x i32> + %2 = icmp ult i32 %y, 32 + %3 = insertelement <8 x i32> undef, i32 %y, i32 0 + %4 = shufflevector <8 x i32> %3, <8 x i32> undef, <8 x i32> zeroinitializer + %5 = select i1 %2, <8 x i32> %1, <8 x i32> zeroinitializer + %6 = select i1 %2, <8 x i32> %4, <8 x i32> zeroinitializer + %7 = lshr <8 x i32> %5, %6 + %8 = bitcast <8 x i32> %7 to <4 x i64> + ret <4 x i64> %8 +} + +define <8 x i64> @combine_scalar_srl_i_512(<8 x i64> %x, i32 %y) { +; SSE-LABEL: combine_scalar_srl_i_512: +; SSE: # %bb.0: +; SSE-NEXT: movd %edi, %xmm4 +; SSE-NEXT: psrld %xmm4, %xmm0 +; SSE-NEXT: psrld %xmm4, %xmm1 +; SSE-NEXT: psrld %xmm4, %xmm2 +; SSE-NEXT: psrld %xmm4, %xmm3 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_scalar_srl_i_512: +; AVX: # %bb.0: +; AVX-NEXT: vmovd %edi, %xmm2 +; AVX-NEXT: vpsrld %xmm2, %ymm0, %ymm0 +; AVX-NEXT: vpsrld %xmm2, %ymm1, %ymm1 +; AVX-NEXT: retq +; +; AVX512-LABEL: combine_scalar_srl_i_512: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovd %edi, %xmm1 +; AVX512-NEXT: vpsrld %xmm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %1 = bitcast <8 x i64> %x to <16 x i32> + %2 = icmp ult i32 %y, 32 + %3 = insertelement <16 x i32> undef, i32 %y, i32 0 + %4 = shufflevector <16 x i32> %3, <16 x i32> undef, <16 x i32> zeroinitializer + %5 = select i1 %2, <16 x i32> %1, <16 x i32> zeroinitializer + %6 = select i1 %2, <16 x i32> %4, <16 x i32> zeroinitializer + %7 = lshr <16 x i32> %5, %6 + %8 = bitcast <16 x i32> %7 to <8 x i64> + ret <8 x i64> %8 +} + +define <2 x i64> @combine_scalar_srl_128(<2 x i64> %x, <2 x i64> %y) { +; SSE-LABEL: combine_scalar_srl_128: +; SSE: # %bb.0: +; SSE-NEXT: psrld %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_scalar_srl_128: +; AVX: # %bb.0: +; AVX-NEXT: vpsrld %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; AVX512-LABEL: combine_scalar_srl_128: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrld %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq + %1 = bitcast <2 x i64> %x to <4 x i32> + %2 = extractelement <2 x i64> %y, i64 0 + %3 = icmp ult i64 %2, 32 + %4 = trunc i64 %2 to i32 + %5 = insertelement <4 x i32> undef, i32 %4, i32 0 + %6 = shufflevector <4 x i32> %5, <4 x i32> undef, <4 x i32> zeroinitializer + %7 = select i1 %3, <4 x i32> %1, <4 x i32> zeroinitializer + %8 = select i1 %3, <4 x i32> %6, <4 x i32> zeroinitializer + %9 = lshr <4 x i32> %7, %8 + %10 = bitcast <4 x i32> %9 to <2 x i64> + ret <2 x i64> %10 +} + +define <4 x i64> @combine_scalar_srl_256(<4 x i64> %x, <4 x i64> %y) { +; SSE-LABEL: combine_scalar_srl_256: +; SSE: # %bb.0: +; SSE-NEXT: psrld %xmm2, %xmm0 +; SSE-NEXT: psrld %xmm2, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_scalar_srl_256: +; AVX: # %bb.0: +; AVX-NEXT: vpsrld %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq +; +; AVX512-LABEL: combine_scalar_srl_256: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrld %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: retq + %1 = bitcast <4 x i64> %x to <8 x i32> + %2 = extractelement <4 x i64> %y, i64 0 + %3 = icmp ult i64 %2, 32 + 
%4 = trunc i64 %2 to i32 + %5 = insertelement <8 x i32> undef, i32 %4, i32 0 + %6 = shufflevector <8 x i32> %5, <8 x i32> undef, <8 x i32> zeroinitializer + %7 = select i1 %3, <8 x i32> %1, <8 x i32> zeroinitializer + %8 = select i1 %3, <8 x i32> %6, <8 x i32> zeroinitializer + %9 = lshr <8 x i32> %7, %8 + %10 = bitcast <8 x i32> %9 to <4 x i64> + ret <4 x i64> %10 +} + +define <8 x i64> @combine_scalar_srl_512(<8 x i64> %x, <8 x i64> %y) { +; SSE-LABEL: combine_scalar_srl_512: +; SSE: # %bb.0: +; SSE-NEXT: psrld %xmm4, %xmm0 +; SSE-NEXT: psrld %xmm4, %xmm1 +; SSE-NEXT: psrld %xmm4, %xmm2 +; SSE-NEXT: psrld %xmm4, %xmm3 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_scalar_srl_512: +; AVX: # %bb.0: +; AVX-NEXT: vpsrld %xmm2, %ymm0, %ymm0 +; AVX-NEXT: vpsrld %xmm2, %ymm1, %ymm1 +; AVX-NEXT: retq +; +; AVX512-LABEL: combine_scalar_srl_512: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrld %xmm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %1 = bitcast <8 x i64> %x to <16 x i32> + %2 = extractelement <8 x i64> %y, i64 0 + %3 = icmp ult i64 %2, 32 + %4 = trunc i64 %2 to i32 + %5 = insertelement <16 x i32> undef, i32 %4, i32 0 + %6 = shufflevector <16 x i32> %5, <16 x i32> undef, <16 x i32> zeroinitializer + %7 = select i1 %3, <16 x i32> %1, <16 x i32> zeroinitializer + %8 = select i1 %3, <16 x i32> %6, <16 x i32> zeroinitializer + %9 = lshr <16 x i32> %7, %8 + %10 = bitcast <16 x i32> %9 to <8 x i64> + ret <8 x i64> %10 +} Index: test/Transforms/InstCombine/X86/x86-vector-shifts.ll =================================================================== --- test/Transforms/InstCombine/X86/x86-vector-shifts.ll +++ test/Transforms/InstCombine/X86/x86-vector-shifts.ll @@ -2675,13 +2675,153 @@ } ; +; ASHR Unknown Shift Vector +; + +define <2 x i64> @avx2_psrav_d_vec(<2 x i64> %v, <2 x i64> %a) { +; CHECK-LABEL: @avx2_psrav_d_vec( +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> %v to <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> %a to <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult <4 x i32> [[TMP2]], +; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP3]], <4 x i32> [[TMP2]], <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <2 x i64> +; CHECK-NEXT: ret <2 x i64> [[TMP6]] +; + %1 = bitcast <2 x i64> %v to <4 x i32> + %2 = bitcast <2 x i64> %a to <4 x i32> + %3 = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %1, <4 x i32> %2) + %4 = bitcast <4 x i32> %3 to <2 x i64> + ret <2 x i64> %4 +} + +define <8 x i32> @avx2_psrav_d_256_vec(<8 x i32> %v, <8 x i32> %a) { +; CHECK-LABEL: @avx2_psrav_d_256_vec( +; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <8 x i32> %a, +; CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> %a, <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = ashr <8 x i32> %v, [[TMP2]] +; CHECK-NEXT: ret <8 x i32> [[TMP3]] +; + %1 = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %v, <8 x i32> %a) + ret <8 x i32> %1 +} + +define <8 x i64> @avx512_psrav_d_512_vec(<8 x i64> %v, <8 x i64> %a) { +; CHECK-LABEL: @avx512_psrav_d_512_vec( +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i64> %v to <16 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i64> %a to <16 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult <16 x i32> [[TMP2]], +; CHECK-NEXT: [[TMP4:%.*]] = select <16 x i1> [[TMP3]], <16 x i32> [[TMP2]], <16 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = ashr <16 x i32> [[TMP1]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP5]] to <8 x i64> +; CHECK-NEXT: ret <8 x i64> [[TMP6]] +; + %1 = bitcast <8 x i64> %v to <16 x i32> + 
%2 = bitcast <8 x i64> %a to <16 x i32> + %3 = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> %1, <16 x i32> %2) + %4 = bitcast <16 x i32> %3 to <8 x i64> + ret <8 x i64> %4 +} + +define <2 x i64> @avx512_psrav_q_128_vec(<2 x i64> %v, <2 x i64> %a) { +; CHECK-LABEL: @avx512_psrav_q_128_vec( +; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <2 x i64> %a, +; CHECK-NEXT: [[TMP2:%.*]] = select <2 x i1> [[TMP1]], <2 x i64> %a, <2 x i64> +; CHECK-NEXT: [[TMP3:%.*]] = ashr <2 x i64> %v, [[TMP2]] +; CHECK-NEXT: ret <2 x i64> [[TMP3]] +; + %1 = call <2 x i64> @llvm.x86.avx512.psrav.q.128(<2 x i64> %v, <2 x i64> %a) + ret <2 x i64> %1 +} + +define <4 x i64> @avx512_psrav_q_256_vec(<4 x i64> %v, <4 x i64> %a) { +; CHECK-LABEL: @avx512_psrav_q_256_vec( +; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <4 x i64> %a, +; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i64> %a, <4 x i64> +; CHECK-NEXT: [[TMP3:%.*]] = ashr <4 x i64> %v, [[TMP2]] +; CHECK-NEXT: ret <4 x i64> [[TMP3]] +; + %1 = call <4 x i64> @llvm.x86.avx512.psrav.q.256(<4 x i64> %v, <4 x i64> %a) + ret <4 x i64> %1 +} + +define <8 x i64> @avx512_psrav_q_512_vec(<8 x i64> %v, <8 x i64> %a) { +; CHECK-LABEL: @avx512_psrav_q_512_vec( +; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <8 x i64> %a, +; CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i64> %a, <8 x i64> +; CHECK-NEXT: [[TMP3:%.*]] = ashr <8 x i64> %v, [[TMP2]] +; CHECK-NEXT: ret <8 x i64> [[TMP3]] +; + %1 = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> %v, <8 x i64> %a) + ret <8 x i64> %1 +} + +define <2 x i64> @avx512_psrav_w_128_vec(<2 x i64> %v, <2 x i64> %a) { +; CHECK-LABEL: @avx512_psrav_w_128_vec( +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> %v to <8 x i16> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> %a to <8 x i16> +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult <8 x i16> [[TMP2]], +; CHECK-NEXT: [[TMP4:%.*]] = select <8 x i1> [[TMP3]], <8 x i16> [[TMP2]], <8 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = ashr <8 x i16> [[TMP1]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <2 x i64> +; CHECK-NEXT: ret <2 x i64> [[TMP6]] +; + %1 = bitcast <2 x i64> %v to <8 x i16> + %2 = bitcast <2 x i64> %a to <8 x i16> + %3 = call <8 x i16> @llvm.x86.avx512.psrav.w.128(<8 x i16> %1, <8 x i16> %2) + %4 = bitcast <8 x i16> %3 to <2 x i64> + ret <2 x i64> %4 +} + +define <4 x i64> @avx512_psrav_w_256_vec(<4 x i64> %v, <4 x i64> %a) { +; CHECK-LABEL: @avx512_psrav_w_256_vec( +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i64> %v to <16 x i16> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i64> %a to <16 x i16> +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult <16 x i16> [[TMP2]], +; CHECK-NEXT: [[TMP4:%.*]] = select <16 x i1> [[TMP3]], <16 x i16> [[TMP2]], <16 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = ashr <16 x i16> [[TMP1]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i16> [[TMP5]] to <4 x i64> +; CHECK-NEXT: ret <4 x i64> [[TMP6]] +; + %1 = bitcast <4 x i64> %v to <16 x i16> + %2 = bitcast <4 x i64> %a to <16 x i16> + %3 = call <16 x i16> @llvm.x86.avx512.psrav.w.256(<16 x i16> %1, <16 x i16> %2) + %4 = bitcast <16 x i16> %3 to <4 x i64> + ret <4 x i64> %4 +} + +define <8 x i64> @avx512_psrav_w_512_vec(<8 x i64> %v, <8 x i64> %a) { +; CHECK-LABEL: @avx512_psrav_w_512_vec( +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i64> %v to <32 x i16> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i64> %a to <32 x i16> +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult <32 x i16> [[TMP2]], +; CHECK-NEXT: [[TMP4:%.*]] = select <32 x i1> [[TMP3]], <32 x i16> [[TMP2]], <32 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = ashr <32 x i16> [[TMP1]],
[[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <32 x i16> [[TMP5]] to <8 x i64> +; CHECK-NEXT: ret <8 x i64> [[TMP6]] +; + %1 = bitcast <8 x i64> %v to <32 x i16> + %2 = bitcast <8 x i64> %a to <32 x i16> + %3 = call <32 x i16> @llvm.x86.avx512.psrav.w.512(<32 x i16> %1, <32 x i16> %2) + %4 = bitcast <32 x i16> %3 to <8 x i64> + ret <8 x i64> %4 +} + +; ; Vector Demanded Bits ; define <8 x i16> @sse2_psra_w_var(<8 x i16> %v, <8 x i16> %a) { ; CHECK-LABEL: @sse2_psra_w_var( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %v, <8 x i16> %a) -; CHECK-NEXT: ret <8 x i16> [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[A:%.*]] to <2 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], 15 +; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], i64 [[TMP2]], i64 15 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[TMP4]] to i16 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[TMP5]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = ashr <8 x i16> [[V:%.*]], [[DOTSPLAT]] +; CHECK-NEXT: ret <8 x i16> [[TMP6]] ; %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> %2 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %v, <8 x i16> %1) @@ -2690,9 +2830,14 @@ define <8 x i16> @sse2_psra_w_var_bc(<8 x i16> %v, <2 x i64> %a) { ; CHECK-LABEL: @sse2_psra_w_var_bc( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> %a to <8 x i16> -; CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %v, <8 x i16> [[TMP1]]) -; CHECK-NEXT: ret <8 x i16> [[TMP2]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[A:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i64 [[TMP1]], 15 +; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i64 [[TMP1]], i64 15 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i16 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[TMP4]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = ashr <8 x i16> [[V:%.*]], [[DOTSPLAT]] +; CHECK-NEXT: ret <8 x i16> [[TMP5]] ; %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> %2 = bitcast <2 x i64> %1 to <8 x i16> @@ -2702,8 +2847,15 @@ define <4 x i32> @sse2_psra_d_var(<4 x i32> %v, <4 x i32> %a) { ; CHECK-LABEL: @sse2_psra_d_var( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %v, <4 x i32> %a) -; CHECK-NEXT: ret <4 x i32> [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], 31 +; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], i64 [[TMP2]], i64 31 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[TMP4]] to i32 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[TMP5]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = ashr <4 x i32> [[V:%.*]], [[DOTSPLAT]] +; CHECK-NEXT: ret <4 x i32> [[TMP6]] ; %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> %2 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %v, <4 x i32> %1) @@ -2712,9 +2864,15 @@ define <4 x i32> @sse2_psra_d_var_bc(<4 x i32> %v, <8 x i16> %a) { ; CHECK-LABEL: 
@sse2_psra_d_var_bc( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> %a to <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %v, <4 x i32> [[TMP1]]) -; CHECK-NEXT: ret <4 x i32> [[TMP2]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[A:%.*]] to <2 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], 31 +; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], i64 [[TMP2]], i64 31 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[TMP4]] to i32 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[TMP5]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = ashr <4 x i32> [[V:%.*]], [[DOTSPLAT]] +; CHECK-NEXT: ret <4 x i32> [[TMP6]] ; %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> %2 = bitcast <8 x i16> %1 to <4 x i32> @@ -2724,8 +2882,15 @@ define <16 x i16> @avx2_psra_w_var(<16 x i16> %v, <8 x i16> %a) { ; CHECK-LABEL: @avx2_psra_w_var( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %v, <8 x i16> %a) -; CHECK-NEXT: ret <16 x i16> [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[A:%.*]] to <2 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], 15 +; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], i64 [[TMP2]], i64 15 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[TMP4]] to i16 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i16> undef, i16 [[TMP5]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i16> [[DOTSPLATINSERT]], <16 x i16> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = ashr <16 x i16> [[V:%.*]], [[DOTSPLAT]] +; CHECK-NEXT: ret <16 x i16> [[TMP6]] ; %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> %2 = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %v, <8 x i16> %1) @@ -2734,8 +2899,15 @@ define <8 x i32> @avx2_psra_d_var(<8 x i32> %v, <4 x i32> %a) { ; CHECK-LABEL: @avx2_psra_d_var( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %v, <4 x i32> %a) -; CHECK-NEXT: ret <8 x i32> [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], 31 +; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], i64 [[TMP2]], i64 31 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[TMP4]] to i32 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[TMP5]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = ashr <8 x i32> [[V:%.*]], [[DOTSPLAT]] +; CHECK-NEXT: ret <8 x i32> [[TMP6]] ; %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> %2 = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %v, <4 x i32> %1) @@ -2744,8 +2916,13 @@ define <2 x i64> @avx512_psra_q_128_var(<2 x i64> %v, <2 x i64> %a) { ; CHECK-LABEL: @avx512_psra_q_128_var( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.x86.avx512.psra.q.128(<2 x i64> %v, <2 x i64> %a) -; CHECK-NEXT: ret <2 x i64> [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[A:%.*]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i64 [[TMP1]], 63 +; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i64 [[TMP1]], i64 63 +; CHECK-NEXT: 
[[DOTSPLATINSERT:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i64> [[DOTSPLATINSERT]], <2 x i64> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = ashr <2 x i64> [[V:%.*]], [[DOTSPLAT]] +; CHECK-NEXT: ret <2 x i64> [[TMP4]] ; %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> %2 = tail call <2 x i64> @llvm.x86.avx512.psra.q.128(<2 x i64> %v, <2 x i64> %1) @@ -2754,8 +2931,13 @@ define <4 x i64> @avx512_psra_q_256_var(<4 x i64> %v, <2 x i64> %a) { ; CHECK-LABEL: @avx512_psra_q_256_var( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.x86.avx512.psra.q.256(<4 x i64> %v, <2 x i64> %a) -; CHECK-NEXT: ret <4 x i64> [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[A:%.*]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i64 [[TMP1]], 63 +; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i64 [[TMP1]], i64 63 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[TMP3]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = ashr <4 x i64> [[V:%.*]], [[DOTSPLAT]] +; CHECK-NEXT: ret <4 x i64> [[TMP4]] ; %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> %2 = tail call <4 x i64> @llvm.x86.avx512.psra.q.256(<4 x i64> %v, <2 x i64> %1) @@ -2764,8 +2946,15 @@ define <32 x i16> @avx512_psra_w_512_var(<32 x i16> %v, <8 x i16> %a) { ; CHECK-LABEL: @avx512_psra_w_512_var( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i16> @llvm.x86.avx512.psra.w.512(<32 x i16> %v, <8 x i16> %a) -; CHECK-NEXT: ret <32 x i16> [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[A:%.*]] to <2 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], 15 +; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], i64 [[TMP2]], i64 15 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[TMP4]] to i16 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <32 x i16> undef, i16 [[TMP5]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <32 x i16> [[DOTSPLATINSERT]], <32 x i16> undef, <32 x i32> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = ashr <32 x i16> [[V:%.*]], [[DOTSPLAT]] +; CHECK-NEXT: ret <32 x i16> [[TMP6]] ; %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> %2 = tail call <32 x i16> @llvm.x86.avx512.psra.w.512(<32 x i16> %v, <8 x i16> %1) @@ -2774,8 +2963,15 @@ define <16 x i32> @avx512_psra_d_512_var(<16 x i32> %v, <4 x i32> %a) { ; CHECK-LABEL: @avx512_psra_d_512_var( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> %v, <4 x i32> %a) -; CHECK-NEXT: ret <16 x i32> [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], 31 +; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], i64 [[TMP2]], i64 31 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[TMP4]] to i32 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i32> undef, i32 [[TMP5]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i32> [[DOTSPLATINSERT]], <16 x i32> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = ashr <16 x i32> [[V:%.*]], [[DOTSPLAT]] +; CHECK-NEXT: ret <16 x i32> [[TMP6]] ; %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> %2 = tail call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> %v, <4 x i32> %1) @@ -2784,8 
+2980,13 @@ define <8 x i64> @avx512_psra_q_512_var(<8 x i64> %v, <2 x i64> %a) { ; CHECK-LABEL: @avx512_psra_q_512_var( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> %v, <2 x i64> %a) -; CHECK-NEXT: ret <8 x i64> [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[A:%.*]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i64 [[TMP1]], 63 +; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i64 [[TMP1]], i64 63 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i64> undef, i64 [[TMP3]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i64> [[DOTSPLATINSERT]], <8 x i64> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = ashr <8 x i64> [[V:%.*]], [[DOTSPLAT]] +; CHECK-NEXT: ret <8 x i64> [[TMP4]] ; %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> %2 = tail call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> %v, <2 x i64> %1) @@ -2794,8 +2995,16 @@ define <8 x i16> @sse2_psrl_w_var(<8 x i16> %v, <8 x i16> %a) { ; CHECK-LABEL: @sse2_psrl_w_var( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %v, <8 x i16> %a) -; CHECK-NEXT: ret <8 x i16> [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[A:%.*]] to <2 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], 16 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP2]] to i16 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[TMP4]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], <8 x i16> [[V:%.*]], <8 x i16> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP3]], <8 x i16> [[DOTSPLAT]], <8 x i16> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = lshr <8 x i16> [[TMP5]], [[TMP6]] +; CHECK-NEXT: ret <8 x i16> [[TMP7]] ; %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> %2 = tail call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %v, <8 x i16> %1) @@ -2804,8 +3013,16 @@ define <4 x i32> @sse2_psrl_d_var(<4 x i32> %v, <4 x i32> %a) { ; CHECK-LABEL: @sse2_psrl_d_var( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %v, <4 x i32> %a) -; CHECK-NEXT: ret <4 x i32> [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], 32 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP2]] to i32 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], <4 x i32> [[V:%.*]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP3]], <4 x i32> [[DOTSPLAT]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = lshr <4 x i32> [[TMP5]], [[TMP6]] +; CHECK-NEXT: ret <4 x i32> [[TMP7]] ; %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> %2 = tail call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %v, <4 x i32> %1) @@ -2814,8 +3031,13 @@ define <2 x i64> @sse2_psrl_q_var(<2 x i64> %v, <2 x i64> %a) { ; CHECK-LABEL: @sse2_psrl_q_var( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %v, <2 x i64> %a) -; CHECK-NEXT: ret <2 x i64> [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x 
i64> [[A:%.*]], <2 x i64> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[A]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], 64 +; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], <2 x i64> [[V:%.*]], <2 x i64> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], <2 x i64> [[TMP1]], <2 x i64> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = lshr <2 x i64> [[TMP4]], [[TMP5]] +; CHECK-NEXT: ret <2 x i64> [[TMP6]] ; %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> %2 = tail call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %v, <2 x i64> %1) @@ -2824,8 +3046,16 @@ define <16 x i16> @avx2_psrl_w_var(<16 x i16> %v, <8 x i16> %a) { ; CHECK-LABEL: @avx2_psrl_w_var( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %v, <8 x i16> %a) -; CHECK-NEXT: ret <16 x i16> [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[A:%.*]] to <2 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], 16 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP2]] to i16 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i16> undef, i16 [[TMP4]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i16> [[DOTSPLATINSERT]], <16 x i16> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], <16 x i16> [[V:%.*]], <16 x i16> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP3]], <16 x i16> [[DOTSPLAT]], <16 x i16> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = lshr <16 x i16> [[TMP5]], [[TMP6]] +; CHECK-NEXT: ret <16 x i16> [[TMP7]] ; %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> %2 = tail call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %v, <8 x i16> %1) @@ -2834,9 +3064,16 @@ define <16 x i16> @avx2_psrl_w_var_bc(<16 x i16> %v, <16 x i8> %a) { ; CHECK-LABEL: @avx2_psrl_w_var_bc( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> %a to <8 x i16> -; CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %v, <8 x i16> [[TMP1]]) -; CHECK-NEXT: ret <16 x i16> [[TMP2]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[A:%.*]] to <2 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], 16 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP2]] to i16 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i16> undef, i16 [[TMP4]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i16> [[DOTSPLATINSERT]], <16 x i16> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], <16 x i16> [[V:%.*]], <16 x i16> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP3]], <16 x i16> [[DOTSPLAT]], <16 x i16> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = lshr <16 x i16> [[TMP5]], [[TMP6]] +; CHECK-NEXT: ret <16 x i16> [[TMP7]] ; %1 = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> %2 = bitcast <16 x i8> %1 to <8 x i16> @@ -2846,8 +3083,16 @@ define <8 x i32> @avx2_psrl_d_var(<8 x i32> %v, <4 x i32> %a) { ; CHECK-LABEL: @avx2_psrl_d_var( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %v, <4 x i32> %a) -; CHECK-NEXT: ret <8 x i32> [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], 32 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP2]] to i32 +; CHECK-NEXT: 
[[DOTSPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], <8 x i32> [[V:%.*]], <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP3]], <8 x i32> [[DOTSPLAT]], <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = lshr <8 x i32> [[TMP5]], [[TMP6]] +; CHECK-NEXT: ret <8 x i32> [[TMP7]] ; %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> %2 = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %v, <4 x i32> %1) @@ -2856,9 +3101,15 @@ define <8 x i32> @avx2_psrl_d_var_bc(<8 x i32> %v, <2 x i64> %a) { ; CHECK-LABEL: @avx2_psrl_d_var_bc( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> %a to <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %v, <4 x i32> [[TMP1]]) -; CHECK-NEXT: ret <8 x i32> [[TMP2]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[A:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i64 [[TMP1]], 32 +; CHECK-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP1]] to i32 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[TMP3]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], <8 x i32> [[V:%.*]], <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP2]], <8 x i32> [[DOTSPLAT]], <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = lshr <8 x i32> [[TMP4]], [[TMP5]] +; CHECK-NEXT: ret <8 x i32> [[TMP6]] ; %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> %2 = bitcast <2 x i64> %1 to <4 x i32> @@ -2868,8 +3119,13 @@ define <4 x i64> @avx2_psrl_q_var(<4 x i64> %v, <2 x i64> %a) { ; CHECK-LABEL: @avx2_psrl_q_var( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %v, <2 x i64> %a) -; CHECK-NEXT: ret <4 x i64> [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[A:%.*]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i64 [[TMP1]], 64 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], <4 x i64> [[V:%.*]], <4 x i64> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], <4 x i64> [[DOTSPLAT]], <4 x i64> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = lshr <4 x i64> [[TMP3]], [[TMP4]] +; CHECK-NEXT: ret <4 x i64> [[TMP5]] ; %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> %2 = tail call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %v, <2 x i64> %1) @@ -2878,8 +3134,16 @@ define <32 x i16> @avx512_psrl_w_512_var(<32 x i16> %v, <8 x i16> %a) { ; CHECK-LABEL: @avx512_psrl_w_512_var( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i16> @llvm.x86.avx512.psrl.w.512(<32 x i16> %v, <8 x i16> %a) -; CHECK-NEXT: ret <32 x i16> [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[A:%.*]] to <2 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], 16 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP2]] to i16 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <32 x i16> undef, i16 [[TMP4]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <32 x i16> [[DOTSPLATINSERT]], <32 x i16> undef, <32 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], <32 x i16> [[V:%.*]], <32 x i16> zeroinitializer +; 
CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP3]], <32 x i16> [[DOTSPLAT]], <32 x i16> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = lshr <32 x i16> [[TMP5]], [[TMP6]] +; CHECK-NEXT: ret <32 x i16> [[TMP7]] ; %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> %2 = tail call <32 x i16> @llvm.x86.avx512.psrl.w.512(<32 x i16> %v, <8 x i16> %1) @@ -2888,9 +3152,16 @@ define <32 x i16> @avx512_psrl_w_512_var_bc(<32 x i16> %v, <16 x i8> %a) { ; CHECK-LABEL: @avx512_psrl_w_512_var_bc( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> %a to <8 x i16> -; CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i16> @llvm.x86.avx512.psrl.w.512(<32 x i16> %v, <8 x i16> [[TMP1]]) -; CHECK-NEXT: ret <32 x i16> [[TMP2]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[A:%.*]] to <2 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], 16 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP2]] to i16 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <32 x i16> undef, i16 [[TMP4]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <32 x i16> [[DOTSPLATINSERT]], <32 x i16> undef, <32 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], <32 x i16> [[V:%.*]], <32 x i16> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP3]], <32 x i16> [[DOTSPLAT]], <32 x i16> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = lshr <32 x i16> [[TMP5]], [[TMP6]] +; CHECK-NEXT: ret <32 x i16> [[TMP7]] ; %1 = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> %2 = bitcast <16 x i8> %1 to <8 x i16> @@ -2900,8 +3171,16 @@ define <16 x i32> @avx512_psrl_d_512_var(<16 x i32> %v, <4 x i32> %a) { ; CHECK-LABEL: @avx512_psrl_d_512_var( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> %v, <4 x i32> %a) -; CHECK-NEXT: ret <16 x i32> [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], 32 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP2]] to i32 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i32> undef, i32 [[TMP4]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i32> [[DOTSPLATINSERT]], <16 x i32> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], <16 x i32> [[V:%.*]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP3]], <16 x i32> [[DOTSPLAT]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = lshr <16 x i32> [[TMP5]], [[TMP6]] +; CHECK-NEXT: ret <16 x i32> [[TMP7]] ; %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> %2 = tail call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> %v, <4 x i32> %1) @@ -2910,9 +3189,15 @@ define <16 x i32> @avx512_psrl_d_512_var_bc(<16 x i32> %v, <2 x i64> %a) { ; CHECK-LABEL: @avx512_psrl_d_512_var_bc( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> %a to <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> %v, <4 x i32> [[TMP1]]) -; CHECK-NEXT: ret <16 x i32> [[TMP2]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[A:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i64 [[TMP1]], 32 +; CHECK-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP1]] to i32 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i32> undef, i32 [[TMP3]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i32> [[DOTSPLATINSERT]], <16 x i32> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: 
[[TMP4:%.*]] = select i1 [[TMP2]], <16 x i32> [[V:%.*]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP2]], <16 x i32> [[DOTSPLAT]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = lshr <16 x i32> [[TMP4]], [[TMP5]] +; CHECK-NEXT: ret <16 x i32> [[TMP6]] ; %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> %2 = bitcast <2 x i64> %1 to <4 x i32> @@ -2922,8 +3207,13 @@ define <8 x i64> @avx512_psrl_q_512_var(<8 x i64> %v, <2 x i64> %a) { ; CHECK-LABEL: @avx512_psrl_q_512_var( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> %v, <2 x i64> %a) -; CHECK-NEXT: ret <8 x i64> [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[A:%.*]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i64 [[TMP1]], 64 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], <8 x i64> [[V:%.*]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], <8 x i64> [[DOTSPLAT]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = lshr <8 x i64> [[TMP3]], [[TMP4]] +; CHECK-NEXT: ret <8 x i64> [[TMP5]] ; %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> %2 = tail call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> %v, <2 x i64> %1) @@ -2932,8 +3222,16 @@ define <8 x i16> @sse2_psll_w_var(<8 x i16> %v, <8 x i16> %a) { ; CHECK-LABEL: @sse2_psll_w_var( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %v, <8 x i16> %a) -; CHECK-NEXT: ret <8 x i16> [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[A:%.*]] to <2 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], 16 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP2]] to i16 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[TMP4]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], <8 x i16> [[V:%.*]], <8 x i16> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP3]], <8 x i16> [[DOTSPLAT]], <8 x i16> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = shl <8 x i16> [[TMP5]], [[TMP6]] +; CHECK-NEXT: ret <8 x i16> [[TMP7]] ; %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> %2 = tail call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %v, <8 x i16> %1) @@ -2942,8 +3240,16 @@ define <4 x i32> @sse2_psll_d_var(<4 x i32> %v, <4 x i32> %a) { ; CHECK-LABEL: @sse2_psll_d_var( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %v, <4 x i32> %a) -; CHECK-NEXT: ret <4 x i32> [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], 32 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP2]] to i32 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], <4 x i32> [[V:%.*]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP3]], <4 x i32> [[DOTSPLAT]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = shl <4 x i32> [[TMP5]], [[TMP6]] +; CHECK-NEXT: ret <4 x i32> [[TMP7]] ; %1 = shufflevector <4 x 
i32> %a, <4 x i32> undef, <4 x i32> %2 = tail call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %v, <4 x i32> %1) @@ -2952,8 +3258,13 @@ define <2 x i64> @sse2_psll_q_var(<2 x i64> %v, <2 x i64> %a) { ; CHECK-LABEL: @sse2_psll_q_var( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %v, <2 x i64> %a) -; CHECK-NEXT: ret <2 x i64> [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i64> [[A:%.*]], <2 x i64> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[A]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], 64 +; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], <2 x i64> [[V:%.*]], <2 x i64> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], <2 x i64> [[TMP1]], <2 x i64> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = shl <2 x i64> [[TMP4]], [[TMP5]] +; CHECK-NEXT: ret <2 x i64> [[TMP6]] ; %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> %2 = tail call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %v, <2 x i64> %1) @@ -2962,8 +3273,16 @@ define <16 x i16> @avx2_psll_w_var(<16 x i16> %v, <8 x i16> %a) { ; CHECK-LABEL: @avx2_psll_w_var( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %v, <8 x i16> %a) -; CHECK-NEXT: ret <16 x i16> [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[A:%.*]] to <2 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], 16 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP2]] to i16 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i16> undef, i16 [[TMP4]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i16> [[DOTSPLATINSERT]], <16 x i16> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], <16 x i16> [[V:%.*]], <16 x i16> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP3]], <16 x i16> [[DOTSPLAT]], <16 x i16> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = shl <16 x i16> [[TMP5]], [[TMP6]] +; CHECK-NEXT: ret <16 x i16> [[TMP7]] ; %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> %2 = tail call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %v, <8 x i16> %1) @@ -2972,8 +3291,16 @@ define <8 x i32> @avx2_psll_d_var(<8 x i32> %v, <4 x i32> %a) { ; CHECK-LABEL: @avx2_psll_d_var( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %v, <4 x i32> %a) -; CHECK-NEXT: ret <8 x i32> [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], 32 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP2]] to i32 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], <8 x i32> [[V:%.*]], <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP3]], <8 x i32> [[DOTSPLAT]], <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = shl <8 x i32> [[TMP5]], [[TMP6]] +; CHECK-NEXT: ret <8 x i32> [[TMP7]] ; %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> %2 = tail call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %v, <4 x i32> %1) @@ -2982,8 +3309,13 @@ define <4 x i64> @avx2_psll_q_var(<4 x i64> %v, <2 x i64> %a) { ; CHECK-LABEL: @avx2_psll_q_var( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> 
@llvm.x86.avx2.psll.q(<4 x i64> %v, <2 x i64> %a) -; CHECK-NEXT: ret <4 x i64> [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[A:%.*]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i64 [[TMP1]], 64 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], <4 x i64> [[V:%.*]], <4 x i64> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], <4 x i64> [[DOTSPLAT]], <4 x i64> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = shl <4 x i64> [[TMP3]], [[TMP4]] +; CHECK-NEXT: ret <4 x i64> [[TMP5]] ; %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> %2 = tail call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %v, <2 x i64> %1) @@ -2992,8 +3324,16 @@ define <32 x i16> @avx512_psll_w_512_var(<32 x i16> %v, <8 x i16> %a) { ; CHECK-LABEL: @avx512_psll_w_512_var( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i16> @llvm.x86.avx512.psll.w.512(<32 x i16> %v, <8 x i16> %a) -; CHECK-NEXT: ret <32 x i16> [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[A:%.*]] to <2 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], 16 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP2]] to i16 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <32 x i16> undef, i16 [[TMP4]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <32 x i16> [[DOTSPLATINSERT]], <32 x i16> undef, <32 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], <32 x i16> [[V:%.*]], <32 x i16> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP3]], <32 x i16> [[DOTSPLAT]], <32 x i16> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = shl <32 x i16> [[TMP5]], [[TMP6]] +; CHECK-NEXT: ret <32 x i16> [[TMP7]] ; %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> %2 = tail call <32 x i16> @llvm.x86.avx512.psll.w.512(<32 x i16> %v, <8 x i16> %1) @@ -3002,8 +3342,16 @@ define <16 x i32> @avx512_psll_d_512_var(<16 x i32> %v, <4 x i32> %a) { ; CHECK-LABEL: @avx512_psll_d_512_var( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> %v, <4 x i32> %a) -; CHECK-NEXT: ret <16 x i32> [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], 32 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP2]] to i32 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i32> undef, i32 [[TMP4]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i32> [[DOTSPLATINSERT]], <16 x i32> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], <16 x i32> [[V:%.*]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP3]], <16 x i32> [[DOTSPLAT]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = shl <16 x i32> [[TMP5]], [[TMP6]] +; CHECK-NEXT: ret <16 x i32> [[TMP7]] ; %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> %2 = tail call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> %v, <4 x i32> %1) @@ -3012,8 +3360,13 @@ define <8 x i64> @avx512_psll_q_512_var(<8 x i64> %v, <2 x i64> %a) { ; CHECK-LABEL: @avx512_psll_q_512_var( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> %v, <2 x i64> %a) -; CHECK-NEXT: ret <8 x i64> [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[A:%.*]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i64 
[[TMP1]], 64 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], <8 x i64> [[V:%.*]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], <8 x i64> [[DOTSPLAT]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = shl <8 x i64> [[TMP3]], [[TMP4]] +; CHECK-NEXT: ret <8 x i64> [[TMP5]] ; %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> %2 = tail call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> %v, <2 x i64> %1)