Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -1687,6 +1687,8 @@
setTargetDAGCombine(ISD::SHL);
setTargetDAGCombine(ISD::SRA);
setTargetDAGCombine(ISD::SRL);
+ setTargetDAGCombine(ISD::ROTL);
+ setTargetDAGCombine(ISD::ROTR);
setTargetDAGCombine(ISD::OR);
setTargetDAGCombine(ISD::AND);
setTargetDAGCombine(ISD::ADD);
@@ -33436,6 +33438,26 @@
return SDValue();
}
+static SDValue combineRotate(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+ SDLoc DL(N);
+ unsigned EltSizeInBits = VT.getScalarSizeInBits();
+
+  // fold (rot X, (and Y, EltSizeInBits - 1)) -> (rot X, Y)
+  // because the X86 rotate instructions reduce the rotate amount modulo
+  // the element size themselves.
+ if (N1.getOpcode() == ISD::AND)
+ if (ConstantSDNode *Const = isConstOrConstSplat(N1.getOperand(1)))
+ if (Const->getZExtValue() == (uint64_t)(EltSizeInBits - 1))
+ return DAG.getNode(N->getOpcode(), DL, VT, N0, N1.getOperand(0));
+
+ return SDValue();
+}
+
static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
@@ -38645,6 +38667,8 @@
case ISD::SHL:
case ISD::SRA:
case ISD::SRL: return combineShift(N, DAG, DCI, Subtarget);
+ case ISD::ROTL:
+ case ISD::ROTR: return combineRotate(N, DAG, DCI, Subtarget);
case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
Index: lib/Transforms/InstCombine/InstCombineCalls.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -1206,6 +1206,99 @@
return nullptr;
}
+static Value *simplifyX86Rotate(const IntrinsicInst &II,
+ InstCombiner::BuilderTy &Builder) {
+ StringRef Name = Intrinsic::getName(II.getIntrinsicID());
+  if (!Name.consume_front("llvm.x86.avx512.mask.pro"))
+ return nullptr;
+
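+  // After stripping the common prefix, the remaining name encodes the rotate
+  // direction ('l'/'r'), immediate vs. variable count ('v') and the element
+  // width ('d'/'q'), e.g. "lv.d.128" for llvm.x86.avx512.mask.prolv.d.128.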
+ bool IsRotLeft = Name[0] == 'l';
+ bool IsVariable = Name[1] == 'v';
+ bool IsEltDW = IsVariable ? Name[3] == 'd' : Name[2] == 'd';
+
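+  // Pick the shift-left/shift-right intrinsics that match the rotate's
+  // vector width, element size and immediate/variable count kind.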
+ Intrinsic::ID SLLID;
+ Intrinsic::ID SRLID;
+ if (Name.endswith(".128")) {
+ if (IsEltDW) {
+ SLLID = IsVariable ? Intrinsic::x86_avx2_psllv_d :
+ Intrinsic::x86_sse2_pslli_d;
+ SRLID = IsVariable ? Intrinsic::x86_avx2_psrlv_d :
+ Intrinsic::x86_sse2_psrli_d;
+ } else {
+ SLLID = IsVariable ? Intrinsic::x86_avx2_psllv_q :
+ Intrinsic::x86_sse2_pslli_q;
+ SRLID = IsVariable ? Intrinsic::x86_avx2_psrlv_q :
+ Intrinsic::x86_sse2_psrli_q;
+ }
+  } else if (Name.endswith(".256")) {
+ if (IsEltDW) {
+ SLLID = IsVariable ? Intrinsic::x86_avx2_psllv_d_256 :
+ Intrinsic::x86_avx2_pslli_d;
+ SRLID = IsVariable ? Intrinsic::x86_avx2_psrlv_d_256 :
+ Intrinsic::x86_avx2_psrli_d;
+ } else {
+ SLLID = IsVariable ? Intrinsic::x86_avx2_psllv_q_256 :
+ Intrinsic::x86_avx2_pslli_q;
+ SRLID = IsVariable ? Intrinsic::x86_avx2_psrlv_q_256 :
+ Intrinsic::x86_avx2_psrli_q;
+ }
+  } else {
+ if (IsEltDW) {
+ SLLID = IsVariable ? Intrinsic::x86_avx512_psllv_d_512 :
+ Intrinsic::x86_avx512_pslli_d_512;
+ SRLID = IsVariable ? Intrinsic::x86_avx512_psrlv_d_512 :
+ Intrinsic::x86_avx512_psrli_d_512;
+ } else {
+ SLLID = IsVariable ? Intrinsic::x86_avx512_psllv_q_512 :
+ Intrinsic::x86_avx512_pslli_q_512;
+ SRLID = IsVariable ? Intrinsic::x86_avx512_psrlv_q_512 :
+ Intrinsic::x86_avx512_psrli_q_512;
+ }
+ }
+
+ Function *F = II.getCalledFunction();
+ Function *SHL = Intrinsic::getDeclaration(F->getParent(), SLLID);
+ Function *SHR = Intrinsic::getDeclaration(F->getParent(), SRLID);
+ Value *A = II.getArgOperand(0);
+ Value *Count = II.getArgOperand(1);
+  Value *Src = II.getArgOperand(2);
+  Value *Mask = II.getArgOperand(3);
+ Value *Sub = nullptr;
+
+ LLVMContext &C = II.getContext();
+ unsigned EltSize = A->getType()->getScalarSizeInBits();
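+  // The count type has to match what the chosen shift intrinsics expect:
+  // a scalar of the element width for the variable forms, i32 for the
+  // immediate forms.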
+ Type *IntType;
+ if (IsVariable)
+ IntType = EltSize == 32 ? Type::getInt32Ty(C) :
+ Type::getInt64Ty(C);
+ else
+ IntType = Type::getInt32Ty(C);
+  Value *Max = ConstantInt::get(IntType, EltSize);
+
+ // Count = Count % Max
+ // Sub = Max - Count
+ if (IsVariable) {
+ unsigned NumElts = Count->getType()->getVectorNumElements();
+ Value *Vec = Builder.CreateVectorSplat(NumElts, Max);
+ Count = Builder.CreateURem(Count, Vec);
+ Sub = Builder.CreateSub(Vec, Count);
+ } else {
+ Count = Builder.CreateURem(Count, Max);
+ Sub = Builder.CreateSub(Max, Count);
+ }
+
+  // rotl(A, Count) == (A << (Count % Max)) | (A >> (Max - Count % Max));
+  // rotr swaps the two shift amounts.
+ Value *LeftShift = IsRotLeft ? Builder.CreateCall(SHL, { A, Count }) :
+ Builder.CreateCall(SHL, { A, Sub });
+ Value *RightShift = IsRotLeft ? Builder.CreateCall(SHR, { A, Sub }) :
+ Builder.CreateCall(SHR, { A, Count });
+ Value *Or = Builder.CreateOr(LeftShift, RightShift);
+ return emitX86MaskSelect(Mask, Or, Src, Builder);
+}
+
static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombiner &IC) {
assert((II.getIntrinsicID() == Intrinsic::cttz ||
II.getIntrinsicID() == Intrinsic::ctlz) &&
@@ -2424,6 +2517,34 @@
break;
}
+ case Intrinsic::x86_avx512_mask_prol_d_128:
+ case Intrinsic::x86_avx512_mask_prol_d_256:
+ case Intrinsic::x86_avx512_mask_prol_d_512:
+ case Intrinsic::x86_avx512_mask_prol_q_128:
+ case Intrinsic::x86_avx512_mask_prol_q_256:
+ case Intrinsic::x86_avx512_mask_prol_q_512:
+ case Intrinsic::x86_avx512_mask_prolv_d_128:
+ case Intrinsic::x86_avx512_mask_prolv_d_256:
+ case Intrinsic::x86_avx512_mask_prolv_d_512:
+ case Intrinsic::x86_avx512_mask_prolv_q_128:
+ case Intrinsic::x86_avx512_mask_prolv_q_256:
+ case Intrinsic::x86_avx512_mask_prolv_q_512:
+ case Intrinsic::x86_avx512_mask_pror_d_128:
+ case Intrinsic::x86_avx512_mask_pror_d_256:
+ case Intrinsic::x86_avx512_mask_pror_d_512:
+ case Intrinsic::x86_avx512_mask_pror_q_128:
+ case Intrinsic::x86_avx512_mask_pror_q_256:
+ case Intrinsic::x86_avx512_mask_pror_q_512:
+ case Intrinsic::x86_avx512_mask_prorv_d_128:
+ case Intrinsic::x86_avx512_mask_prorv_d_256:
+ case Intrinsic::x86_avx512_mask_prorv_d_512:
+ case Intrinsic::x86_avx512_mask_prorv_q_128:
+ case Intrinsic::x86_avx512_mask_prorv_q_256:
+ case Intrinsic::x86_avx512_mask_prorv_q_512:
+ if (Value *V = simplifyX86Rotate(*II, Builder))
+ return replaceInstUsesWith(*II, V);
+ break;
+
  // Constant fold ashr( <A x Bi>, Ci ).
  // Constant fold lshr( <A x Bi>, Ci ).
  // Constant fold shl( <A x Bi>, Ci ).
Index: test/CodeGen/X86/combine-rotates.ll
===================================================================
--- test/CodeGen/X86/combine-rotates.ll
+++ test/CodeGen/X86/combine-rotates.ll
@@ -57,3 +57,712 @@
%6 = or <4 x i32> %4, %5
ret <4 x i32> %6
}
+
+
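+; A variable rotate amount that has already been masked to the element width
+; should be combined into the AVX512 vprolv/vprorv instructions.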
+define <2 x i64> @combine_var_rol_epi32_128(<2 x i64> %a, <2 x i64> %b) {
+; AVX512-LABEL: combine_var_rol_epi32_128:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vprolvd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: retq
+ %a0 = bitcast <2 x i64> %a to <4 x i32>
+ %b0 = bitcast <2 x i64> %b to <4 x i32>
+  %1 = and <4 x i32> %b0, <i32 31, i32 31, i32 31, i32 31>
+  %2 = sub nsw <4 x i32> <i32 32, i32 32, i32 32, i32 32>, %1
+  %3 = shl <4 x i32> %a0, %1
+  %4 = icmp ult <4 x i32> %2, <i32 32, i32 32, i32 32, i32 32>
+ %5 = select <4 x i1> %4, <4 x i32> %a0, <4 x i32> zeroinitializer
+ %6 = select <4 x i1> %4, <4 x i32> %2, <4 x i32> zeroinitializer
+ %7 = lshr <4 x i32> %5, %6
+ %8 = or <4 x i32> %7, %3
+ %bc = bitcast <4 x i32> %8 to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <4 x i64> @combine_var_rol_epi32_256(<4 x i64> %a, <4 x i64> %b) {
+; AVX512-LABEL: combine_var_rol_epi32_256:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vprolvd %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: retq
+ %a0 = bitcast <4 x i64> %a to <8 x i32>
+ %b0 = bitcast <4 x i64> %b to <8 x i32>
+  %1 = and <8 x i32> %b0, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+  %2 = sub nsw <8 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>, %1
+  %3 = shl <8 x i32> %a0, %1
+  %4 = icmp ult <8 x i32> %2, <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
+ %5 = select <8 x i1> %4, <8 x i32> %a0, <8 x i32> zeroinitializer
+ %6 = select <8 x i1> %4, <8 x i32> %2, <8 x i32> zeroinitializer
+ %7 = lshr <8 x i32> %5, %6
+ %8 = or <8 x i32> %7, %3
+ %bc = bitcast <8 x i32> %8 to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <8 x i64> @combine_var_rol_epi32_512(<8 x i64> %a, <8 x i64> %b) {
+; AVX512-LABEL: combine_var_rol_epi32_512:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vprolvd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %a0 = bitcast <8 x i64> %a to <16 x i32>
+ %b0 = bitcast <8 x i64> %b to <16 x i32>
+  %1 = and <16 x i32> %b0, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+  %2 = sub nsw <16 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>, %1
+  %3 = shl <16 x i32> %a0, %1
+  %4 = icmp ult <16 x i32> %2, <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
+ %5 = select <16 x i1> %4, <16 x i32> %a0, <16 x i32> zeroinitializer
+ %6 = select <16 x i1> %4, <16 x i32> %2, <16 x i32> zeroinitializer
+ %7 = lshr <16 x i32> %5, %6
+ %8 = or <16 x i32> %7, %3
+ %bc = bitcast <16 x i32> %8 to <8 x i64>
+ ret <8 x i64> %bc
+}
+
+define <2 x i64> @combine_var_ror_epi32_128(<2 x i64> %a, <2 x i64> %b) {
+; AVX512-LABEL: combine_var_ror_epi32_128:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vprorvd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: retq
+ %a0 = bitcast <2 x i64> %a to <4 x i32>
+ %b0 = bitcast <2 x i64> %b to <4 x i32>
+  %1 = and <4 x i32> %b0, <i32 31, i32 31, i32 31, i32 31>
+  %2 = sub nsw <4 x i32> <i32 32, i32 32, i32 32, i32 32>, %1
+  %3 = lshr <4 x i32> %a0, %1
+  %4 = icmp ult <4 x i32> %2, <i32 32, i32 32, i32 32, i32 32>
+ %5 = select <4 x i1> %4, <4 x i32> %a0, <4 x i32> zeroinitializer
+ %6 = select <4 x i1> %4, <4 x i32> %2, <4 x i32> zeroinitializer
+ %7 = shl <4 x i32> %5, %6
+ %8 = or <4 x i32> %7, %3
+ %bc = bitcast <4 x i32> %8 to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <4 x i64> @combine_var_ror_epi32_256(<4 x i64> %a, <4 x i64> %b) {
+; AVX512-LABEL: combine_var_ror_epi32_256:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vprorvd %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: retq
+ %a0 = bitcast <4 x i64> %a to <8 x i32>
+ %b0 = bitcast <4 x i64> %b to <8 x i32>
+  %1 = and <8 x i32> %b0, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+  %2 = sub nsw <8 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>, %1
+  %3 = lshr <8 x i32> %a0, %1
+  %4 = icmp ult <8 x i32> %2, <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
+ %5 = select <8 x i1> %4, <8 x i32> %a0, <8 x i32> zeroinitializer
+ %6 = select <8 x i1> %4, <8 x i32> %2, <8 x i32> zeroinitializer
+ %7 = shl <8 x i32> %5, %6
+ %8 = or <8 x i32> %7, %3
+ %bc = bitcast <8 x i32> %8 to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <8 x i64> @combine_var_ror_epi32_512(<8 x i64> %a, <8 x i64> %b) {
+; AVX512-LABEL: combine_var_ror_epi32_512:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vprorvd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %a0 = bitcast <8 x i64> %a to <16 x i32>
+ %b0 = bitcast <8 x i64> %b to <16 x i32>
+  %1 = and <16 x i32> %b0, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+  %2 = sub nsw <16 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>, %1
+  %3 = lshr <16 x i32> %a0, %1
+  %4 = icmp ult <16 x i32> %2, <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
+ %5 = select <16 x i1> %4, <16 x i32> %a0, <16 x i32> zeroinitializer
+ %6 = select <16 x i1> %4, <16 x i32> %2, <16 x i32> zeroinitializer
+ %7 = shl <16 x i32> %5, %6
+ %8 = or <16 x i32> %7, %3
+ %bc = bitcast <16 x i32> %8 to <8 x i64>
+ ret <8 x i64> %bc
+}
+
+define <2 x i64> @combine_var_rol_epi64_128(<2 x i64> %a, <2 x i64> %b) {
+; AVX512-LABEL: combine_var_rol_epi64_128:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vprolvq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: retq
+  %1 = and <2 x i64> %b, <i64 63, i64 63>
+  %2 = sub nsw <2 x i64> <i64 64, i64 64>, %1
+  %3 = shl <2 x i64> %a, %1
+  %4 = icmp ult <2 x i64> %2, <i64 64, i64 64>
+ %5 = select <2 x i1> %4, <2 x i64> %a, <2 x i64> zeroinitializer
+ %6 = select <2 x i1> %4, <2 x i64> %2, <2 x i64> zeroinitializer
+ %7 = lshr <2 x i64> %5, %6
+ %8 = or <2 x i64> %7, %3
+ ret <2 x i64> %8
+}
+
+define <4 x i64> @combine_var_rol_epi64_256(<4 x i64> %a, <4 x i64> %b) {
+; AVX512-LABEL: combine_var_rol_epi64_256:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vprolvq %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: retq
+  %1 = and <4 x i64> %b, <i64 63, i64 63, i64 63, i64 63>
+  %2 = sub nsw <4 x i64> <i64 64, i64 64, i64 64, i64 64>, %1
+  %3 = shl <4 x i64> %a, %1
+  %4 = icmp ult <4 x i64> %2, <i64 64, i64 64, i64 64, i64 64>
+ %5 = select <4 x i1> %4, <4 x i64> %a, <4 x i64> zeroinitializer
+ %6 = select <4 x i1> %4, <4 x i64> %2, <4 x i64> zeroinitializer
+ %7 = lshr <4 x i64> %5, %6
+ %8 = or <4 x i64> %7, %3
+ ret <4 x i64> %8
+}
+
+define <8 x i64> @combine_var_rol_epi64_512(<8 x i64> %a, <8 x i64> %b) {
+; AVX512-LABEL: combine_var_rol_epi64_512:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vprolvq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: retq
+  %1 = and <8 x i64> %b, <i64 63, i64 63, i64 63, i64 63, i64 63, i64 63, i64 63, i64 63>
+  %2 = sub nsw <8 x i64> <i64 64, i64 64, i64 64, i64 64, i64 64, i64 64, i64 64, i64 64>, %1
+  %3 = shl <8 x i64> %a, %1
+  %4 = icmp ult <8 x i64> %2, <i64 64, i64 64, i64 64, i64 64, i64 64, i64 64, i64 64, i64 64>
+ %5 = select <8 x i1> %4, <8 x i64> %a, <8 x i64> zeroinitializer
+ %6 = select <8 x i1> %4, <8 x i64> %2, <8 x i64> zeroinitializer
+ %7 = lshr <8 x i64> %5, %6
+ %8 = or <8 x i64> %7, %3
+ ret <8 x i64> %8
+}
+
+define <2 x i64> @combine_var_ror_epi64_128(<2 x i64> %a, <2 x i64> %b) {
+; AVX512-LABEL: combine_var_ror_epi64_128:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vprorvq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: retq
+  %1 = and <2 x i64> %b, <i64 63, i64 63>
+  %2 = sub nsw <2 x i64> <i64 64, i64 64>, %1
+  %3 = lshr <2 x i64> %a, %1
+  %4 = icmp ult <2 x i64> %2, <i64 64, i64 64>
+ %5 = select <2 x i1> %4, <2 x i64> %a, <2 x i64> zeroinitializer
+ %6 = select <2 x i1> %4, <2 x i64> %2, <2 x i64> zeroinitializer
+ %7 = shl <2 x i64> %5, %6
+ %8 = or <2 x i64> %7, %3
+ ret <2 x i64> %8
+}
+
+define <4 x i64> @combine_var_ror_epi64_256(<4 x i64> %a, <4 x i64> %b) {
+; AVX512-LABEL: combine_var_ror_epi64_256:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vprorvq %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: retq
+  %1 = and <4 x i64> %b, <i64 63, i64 63, i64 63, i64 63>
+  %2 = sub nsw <4 x i64> <i64 64, i64 64, i64 64, i64 64>, %1
+  %3 = lshr <4 x i64> %a, %1
+  %4 = icmp ult <4 x i64> %2, <i64 64, i64 64, i64 64, i64 64>
+ %5 = select <4 x i1> %4, <4 x i64> %a, <4 x i64> zeroinitializer
+ %6 = select <4 x i1> %4, <4 x i64> %2, <4 x i64> zeroinitializer
+ %7 = shl <4 x i64> %5, %6
+ %8 = or <4 x i64> %7, %3
+ ret <4 x i64> %8
+}
+
+define <8 x i64> @combine_var_ror_epi64_512(<8 x i64> %a, <8 x i64> %b) {
+; AVX512-LABEL: combine_var_ror_epi64_512:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vprorvq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: retq
+  %1 = and <8 x i64> %b, <i64 63, i64 63, i64 63, i64 63, i64 63, i64 63, i64 63, i64 63>
+  %2 = sub nsw <8 x i64> <i64 64, i64 64, i64 64, i64 64, i64 64, i64 64, i64 64, i64 64>, %1
+  %3 = lshr <8 x i64> %a, %1
+  %4 = icmp ult <8 x i64> %2, <i64 64, i64 64, i64 64, i64 64, i64 64, i64 64, i64 64, i64 64>
+ %5 = select <8 x i1> %4, <8 x i64> %a, <8 x i64> zeroinitializer
+ %6 = select <8 x i1> %4, <8 x i64> %2, <8 x i64> zeroinitializer
+ %7 = shl <8 x i64> %5, %6
+ %8 = or <8 x i64> %7, %3
+ ret <8 x i64> %8
+}
+
+define <2 x i64> @combine_var_mask_rol_epi32_128(<2 x i64> %w, i8 zeroext %u, <2 x i64> %a, <2 x i64> %b) {
+; AVX512-LABEL: combine_var_mask_rol_epi32_128:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovw %edi, %k1
+; AVX512-NEXT: vprolvd %xmm2, %xmm1, %xmm0 {%k1}
+; AVX512-NEXT: retq
+ %a0 = bitcast <2 x i64> %a to <4 x i32>
+ %b0 = bitcast <2 x i64> %b to <4 x i32>
+ %w0 = bitcast <2 x i64> %w to <4 x i32>
+  %1 = and <4 x i32> %b0, <i32 31, i32 31, i32 31, i32 31>
+  %2 = sub nsw <4 x i32> <i32 32, i32 32, i32 32, i32 32>, %1
+  %3 = shl <4 x i32> %a0, %1
+  %4 = icmp ult <4 x i32> %2, <i32 32, i32 32, i32 32, i32 32>
+ %5 = select <4 x i1> %4, <4 x i32> %a0, <4 x i32> zeroinitializer
+ %6 = select <4 x i1> %4, <4 x i32> %2, <4 x i32> zeroinitializer
+ %7 = lshr <4 x i32> %5, %6
+ %8 = or <4 x i32> %7, %3
+ %9 = bitcast i8 %u to <8 x i1>
+  %10 = shufflevector <8 x i1> %9, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %11 = select <4 x i1> %10, <4 x i32> %8, <4 x i32> %w0
+ %bc = bitcast <4 x i32> %11 to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <4 x i64> @combine_var_mask_rol_epi32_256(<4 x i64> %w, i8 zeroext %u, <4 x i64> %a, <4 x i64> %b) {
+; AVX512-LABEL: combine_var_mask_rol_epi32_256:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovw %edi, %k1
+; AVX512-NEXT: vprolvd %ymm2, %ymm1, %ymm0 {%k1}
+; AVX512-NEXT: retq
+ %a0 = bitcast <4 x i64> %a to <8 x i32>
+ %b0 = bitcast <4 x i64> %b to <8 x i32>
+ %w0 = bitcast <4 x i64> %w to <8 x i32>
+  %1 = and <8 x i32> %b0, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+  %2 = sub nsw <8 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>, %1
+  %3 = shl <8 x i32> %a0, %1
+  %4 = icmp ult <8 x i32> %2, <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
+ %5 = select <8 x i1> %4, <8 x i32> %a0, <8 x i32> zeroinitializer
+ %6 = select <8 x i1> %4, <8 x i32> %2, <8 x i32> zeroinitializer
+ %7 = lshr <8 x i32> %5, %6
+ %8 = or <8 x i32> %7, %3
+ %9 = bitcast i8 %u to <8 x i1>
+ %10 = select <8 x i1> %9, <8 x i32> %8, <8 x i32> %w0
+ %bc = bitcast <8 x i32> %10 to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <8 x i64> @combine_var_mask_rol_epi32_512(<8 x i64> %w, i16 zeroext %u, <8 x i64> %a, <8 x i64> %b) {
+; AVX512-LABEL: combine_var_mask_rol_epi32_512:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovw %edi, %k1
+; AVX512-NEXT: vprolvd %zmm2, %zmm1, %zmm0 {%k1}
+; AVX512-NEXT: retq
+ %a0 = bitcast <8 x i64> %a to <16 x i32>
+ %b0 = bitcast <8 x i64> %b to <16 x i32>
+ %w0 = bitcast <8 x i64> %w to <16 x i32>
+  %1 = and <16 x i32> %b0, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+  %2 = sub nsw <16 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>, %1
+  %3 = shl <16 x i32> %a0, %1
+  %4 = icmp ult <16 x i32> %2, <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
+ %5 = select <16 x i1> %4, <16 x i32> %a0, <16 x i32> zeroinitializer
+ %6 = select <16 x i1> %4, <16 x i32> %2, <16 x i32> zeroinitializer
+ %7 = lshr <16 x i32> %5, %6
+ %8 = or <16 x i32> %7, %3
+ %9 = bitcast i16 %u to <16 x i1>
+ %10 = select <16 x i1> %9, <16 x i32> %8, <16 x i32> %w0
+ %bc = bitcast <16 x i32> %10 to <8 x i64>
+ ret <8 x i64> %bc
+}
+
+define <2 x i64> @combine_var_mask_ror_epi32_128(<2 x i64> %w, i8 zeroext %u, <2 x i64> %a, <2 x i64> %b) {
+; AVX512-LABEL: combine_var_mask_ror_epi32_128:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovw %edi, %k1
+; AVX512-NEXT: vprorvd %xmm2, %xmm1, %xmm0 {%k1}
+; AVX512-NEXT: retq
+ %a0 = bitcast <2 x i64> %a to <4 x i32>
+ %b0 = bitcast <2 x i64> %b to <4 x i32>
+ %w0 = bitcast <2 x i64> %w to <4 x i32>
+  %1 = and <4 x i32> %b0, <i32 31, i32 31, i32 31, i32 31>
+  %2 = sub nsw <4 x i32> <i32 32, i32 32, i32 32, i32 32>, %1
+  %3 = lshr <4 x i32> %a0, %1
+  %4 = icmp ult <4 x i32> %2, <i32 32, i32 32, i32 32, i32 32>
+ %5 = select <4 x i1> %4, <4 x i32> %a0, <4 x i32> zeroinitializer
+ %6 = select <4 x i1> %4, <4 x i32> %2, <4 x i32> zeroinitializer
+ %7 = shl <4 x i32> %5, %6
+ %8 = or <4 x i32> %7, %3
+ %9 = bitcast i8 %u to <8 x i1>
+  %10 = shufflevector <8 x i1> %9, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %11 = select <4 x i1> %10, <4 x i32> %8, <4 x i32> %w0
+ %bc = bitcast <4 x i32> %11 to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <4 x i64> @combine_var_mask_ror_epi32_256(<4 x i64> %w, i8 zeroext %u, <4 x i64> %a, <4 x i64> %b) {
+; AVX512-LABEL: combine_var_mask_ror_epi32_256:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovw %edi, %k1
+; AVX512-NEXT: vprorvd %ymm2, %ymm1, %ymm0 {%k1}
+; AVX512-NEXT: retq
+ %a0 = bitcast <4 x i64> %a to <8 x i32>
+ %b0 = bitcast <4 x i64> %b to <8 x i32>
+ %w0 = bitcast <4 x i64> %w to <8 x i32>
+  %1 = and <8 x i32> %b0, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+  %2 = sub nsw <8 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>, %1
+  %3 = lshr <8 x i32> %a0, %1
+  %4 = icmp ult <8 x i32> %2, <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
+ %5 = select <8 x i1> %4, <8 x i32> %a0, <8 x i32> zeroinitializer
+ %6 = select <8 x i1> %4, <8 x i32> %2, <8 x i32> zeroinitializer
+ %7 = shl <8 x i32> %5, %6
+ %8 = or <8 x i32> %7, %3
+ %9 = bitcast i8 %u to <8 x i1>
+ %10 = select <8 x i1> %9, <8 x i32> %8, <8 x i32> %w0
+ %bc = bitcast <8 x i32> %10 to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <8 x i64> @combine_var_mask_ror_epi32_512(<8 x i64> %w, i16 zeroext %u, <8 x i64> %a, <8 x i64> %b) {
+; AVX512-LABEL: combine_var_mask_ror_epi32_512:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovw %edi, %k1
+; AVX512-NEXT: vprorvd %zmm2, %zmm1, %zmm0 {%k1}
+; AVX512-NEXT: retq
+ %a0 = bitcast <8 x i64> %a to <16 x i32>
+ %b0 = bitcast <8 x i64> %b to <16 x i32>
+ %w0 = bitcast <8 x i64> %w to <16 x i32>
+  %1 = and <16 x i32> %b0, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+  %2 = sub nsw <16 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>, %1
+  %3 = lshr <16 x i32> %a0, %1
+  %4 = icmp ult <16 x i32> %2, <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
+ %5 = select <16 x i1> %4, <16 x i32> %a0, <16 x i32> zeroinitializer
+ %6 = select <16 x i1> %4, <16 x i32> %2, <16 x i32> zeroinitializer
+ %7 = shl <16 x i32> %5, %6
+ %8 = or <16 x i32> %7, %3
+ %9 = bitcast i16 %u to <16 x i1>
+ %10 = select <16 x i1> %9, <16 x i32> %8, <16 x i32> %w0
+ %bc = bitcast <16 x i32> %10 to <8 x i64>
+ ret <8 x i64> %bc
+}
+
+define <2 x i64> @combine_var_mask_rol_epi64_128(<2 x i64> %w, i8 zeroext %u, <2 x i64> %a, <2 x i64> %b) {
+; AVX512-LABEL: combine_var_mask_rol_epi64_128:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vprolvq %xmm2, %xmm1, %xmm0
+; AVX512-NEXT: retq
+  %1 = and <2 x i64> %b, <i64 63, i64 63>
+  %2 = sub nsw <2 x i64> <i64 64, i64 64>, %1
+  %3 = shl <2 x i64> %a, %1
+  %4 = icmp ult <2 x i64> %2, <i64 64, i64 64>
+ %5 = select <2 x i1> %4, <2 x i64> %a, <2 x i64> zeroinitializer
+ %6 = select <2 x i1> %4, <2 x i64> %2, <2 x i64> zeroinitializer
+ %7 = lshr <2 x i64> %5, %6
+ %8 = or <2 x i64> %7, %3
+ %9 = bitcast i8 %u to <8 x i1>
+  %10 = shufflevector <8 x i1> %9, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+ %11 = select <2 x i1> %10, <2 x i64> %8, <2 x i64> %w
+ ret <2 x i64> %8
+}
+
+define <4 x i64> @combine_var_mask_rol_epi64_256(<4 x i64> %w, i8 zeroext %u, <4 x i64> %a, <4 x i64> %b) {
+; AVX512-LABEL: combine_var_mask_rol_epi64_256:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vprolvq %ymm2, %ymm1, %ymm0
+; AVX512-NEXT: retq
+  %1 = and <4 x i64> %b, <i64 63, i64 63, i64 63, i64 63>
+  %2 = sub nsw <4 x i64> <i64 64, i64 64, i64 64, i64 64>, %1
+  %3 = shl <4 x i64> %a, %1
+  %4 = icmp ult <4 x i64> %2, <i64 64, i64 64, i64 64, i64 64>
+ %5 = select <4 x i1> %4, <4 x i64> %a, <4 x i64> zeroinitializer
+ %6 = select <4 x i1> %4, <4 x i64> %2, <4 x i64> zeroinitializer
+ %7 = lshr <4 x i64> %5, %6
+ %8 = or <4 x i64> %7, %3
+ %9 = bitcast i8 %u to <8 x i1>
+  %10 = shufflevector <8 x i1> %9, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %11 = select <4 x i1> %10, <4 x i64> %8, <4 x i64> %w
+ ret <4 x i64> %8
+}
+
+define <8 x i64> @combine_var_mask_rol_epi64_512(<8 x i64> %w, i8 zeroext %u, <8 x i64> %a, <8 x i64> %b) {
+; AVX512-LABEL: combine_var_mask_rol_epi64_512:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vprolvq %zmm2, %zmm1, %zmm0
+; AVX512-NEXT: retq
+  %1 = and <8 x i64> %b, <i64 63, i64 63, i64 63, i64 63, i64 63, i64 63, i64 63, i64 63>
+  %2 = sub nsw <8 x i64> <i64 64, i64 64, i64 64, i64 64, i64 64, i64 64, i64 64, i64 64>, %1
+  %3 = shl <8 x i64> %a, %1
+  %4 = icmp ult <8 x i64> %2, <i64 64, i64 64, i64 64, i64 64, i64 64, i64 64, i64 64, i64 64>
+ %5 = select <8 x i1> %4, <8 x i64> %a, <8 x i64> zeroinitializer
+ %6 = select <8 x i1> %4, <8 x i64> %2, <8 x i64> zeroinitializer
+ %7 = lshr <8 x i64> %5, %6
+ %8 = or <8 x i64> %7, %3
+ %9 = bitcast i8 %u to <8 x i1>
+ %10 = select <8 x i1> %9, <8 x i64> %8, <8 x i64> %w
+ ret <8 x i64> %8
+}
+
+define <2 x i64> @combine_var_mask_ror_epi64_128(<2 x i64> %w, i8 zeroext %u, <2 x i64> %a, <2 x i64> %b) {
+; AVX512-LABEL: combine_var_mask_ror_epi64_128:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vprorvq %xmm2, %xmm1, %xmm0
+; AVX512-NEXT: retq
+  %1 = and <2 x i64> %b, <i64 63, i64 63>
+  %2 = sub nsw <2 x i64> <i64 64, i64 64>, %1
+  %3 = lshr <2 x i64> %a, %1
+  %4 = icmp ult <2 x i64> %2, <i64 64, i64 64>
+ %5 = select <2 x i1> %4, <2 x i64> %a, <2 x i64> zeroinitializer
+ %6 = select <2 x i1> %4, <2 x i64> %2, <2 x i64> zeroinitializer
+ %7 = shl <2 x i64> %5, %6
+ %8 = or <2 x i64> %7, %3
+ %9 = bitcast i8 %u to <8 x i1>
+  %10 = shufflevector <8 x i1> %9, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+ %11 = select <2 x i1> %10, <2 x i64> %8, <2 x i64> %w
+ ret <2 x i64> %8
+}
+
+define <4 x i64> @combine_var_mask_ror_epi64_256(<4 x i64> %w, i8 zeroext %u, <4 x i64> %a, <4 x i64> %b) {
+; AVX512-LABEL: combine_var_mask_ror_epi64_256:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vprorvq %ymm2, %ymm1, %ymm0
+; AVX512-NEXT: retq
+  %1 = and <4 x i64> %b, <i64 63, i64 63, i64 63, i64 63>
+  %2 = sub nsw <4 x i64> <i64 64, i64 64, i64 64, i64 64>, %1
+  %3 = lshr <4 x i64> %a, %1
+  %4 = icmp ult <4 x i64> %2, <i64 64, i64 64, i64 64, i64 64>
+ %5 = select <4 x i1> %4, <4 x i64> %a, <4 x i64> zeroinitializer
+ %6 = select <4 x i1> %4, <4 x i64> %2, <4 x i64> zeroinitializer
+ %7 = shl <4 x i64> %5, %6
+ %8 = or <4 x i64> %7, %3
+ %9 = bitcast i8 %u to <8 x i1>
+  %10 = shufflevector <8 x i1> %9, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %11 = select <4 x i1> %10, <4 x i64> %8, <4 x i64> %w
+ ret <4 x i64> %8
+}
+
+define <8 x i64> @combine_var_mask_ror_epi64_512(<8 x i64> %w, i8 zeroext %u, <8 x i64> %a, <8 x i64> %b) {
+; AVX512-LABEL: combine_var_mask_ror_epi64_512:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vprorvq %zmm2, %zmm1, %zmm0
+; AVX512-NEXT: retq
+  %1 = and <8 x i64> %b, <i64 63, i64 63, i64 63, i64 63, i64 63, i64 63, i64 63, i64 63>
+  %2 = sub nsw <8 x i64> <i64 64, i64 64, i64 64, i64 64, i64 64, i64 64, i64 64, i64 64>, %1
+  %3 = lshr <8 x i64> %a, %1
+  %4 = icmp ult <8 x i64> %2, <i64 64, i64 64, i64 64, i64 64, i64 64, i64 64, i64 64, i64 64>
+ %5 = select <8 x i1> %4, <8 x i64> %a, <8 x i64> zeroinitializer
+ %6 = select <8 x i1> %4, <8 x i64> %2, <8 x i64> zeroinitializer
+ %7 = shl <8 x i64> %5, %6
+ %8 = or <8 x i64> %7, %3
+ %9 = bitcast i8 %u to <8 x i1>
+ %10 = select <8 x i1> %9, <8 x i64> %8, <8 x i64> %w
+ ret <8 x i64> %8
+}
+
+define <2 x i64> @combine_var_maskz_rol_epi32_128(i8 zeroext %u, <2 x i64> %a, <2 x i64> %b) {
+; AVX512-LABEL: combine_var_maskz_rol_epi32_128:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovw %edi, %k1
+; AVX512-NEXT: vprolvd %xmm1, %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT: retq
+ %a0 = bitcast <2 x i64> %a to <4 x i32>
+ %b0 = bitcast <2 x i64> %b to <4 x i32>
+  %1 = and <4 x i32> %b0, <i32 31, i32 31, i32 31, i32 31>
+  %2 = sub nsw <4 x i32> <i32 32, i32 32, i32 32, i32 32>, %1
+  %3 = shl <4 x i32> %a0, %1
+  %4 = icmp ult <4 x i32> %2, <i32 32, i32 32, i32 32, i32 32>
+ %5 = select <4 x i1> %4, <4 x i32> %a0, <4 x i32> zeroinitializer
+ %6 = select <4 x i1> %4, <4 x i32> %2, <4 x i32> zeroinitializer
+ %7 = lshr <4 x i32> %5, %6
+ %8 = or <4 x i32> %7, %3
+ %9 = bitcast i8 %u to <8 x i1>
+  %10 = shufflevector <8 x i1> %9, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %11 = select <4 x i1> %10, <4 x i32> %8, <4 x i32> zeroinitializer
+ %bc = bitcast <4 x i32> %11 to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <4 x i64> @combine_var_maskz_rol_epi32_256(i8 zeroext %u, <4 x i64> %a, <4 x i64> %b) {
+; AVX512-LABEL: combine_var_maskz_rol_epi32_256:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovw %edi, %k1
+; AVX512-NEXT: vprolvd %ymm1, %ymm0, %ymm0 {%k1} {z}
+; AVX512-NEXT: retq
+ %a0 = bitcast <4 x i64> %a to <8 x i32>
+ %b0 = bitcast <4 x i64> %b to <8 x i32>
+  %1 = and <8 x i32> %b0, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+  %2 = sub nsw <8 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>, %1
+  %3 = shl <8 x i32> %a0, %1
+  %4 = icmp ult <8 x i32> %2, <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
+ %5 = select <8 x i1> %4, <8 x i32> %a0, <8 x i32> zeroinitializer
+ %6 = select <8 x i1> %4, <8 x i32> %2, <8 x i32> zeroinitializer
+ %7 = lshr <8 x i32> %5, %6
+ %8 = or <8 x i32> %7, %3
+ %9 = bitcast i8 %u to <8 x i1>
+ %10 = select <8 x i1> %9, <8 x i32> %8, <8 x i32> zeroinitializer
+ %bc = bitcast <8 x i32> %10 to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <8 x i64> @combine_var_maskz_rol_epi32_512(i16 zeroext %u, <8 x i64> %a, <8 x i64> %b) {
+; AVX512-LABEL: combine_var_maskz_rol_epi32_512:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovw %edi, %k1
+; AVX512-NEXT: vprolvd %zmm1, %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT: retq
+ %a0 = bitcast <8 x i64> %a to <16 x i32>
+ %b0 = bitcast <8 x i64> %b to <16 x i32>
+  %1 = and <16 x i32> %b0, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+  %2 = sub nsw <16 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>, %1
+  %3 = shl <16 x i32> %a0, %1
+  %4 = icmp ult <16 x i32> %2, <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
+ %5 = select <16 x i1> %4, <16 x i32> %a0, <16 x i32> zeroinitializer
+ %6 = select <16 x i1> %4, <16 x i32> %2, <16 x i32> zeroinitializer
+ %7 = lshr <16 x i32> %5, %6
+ %8 = or <16 x i32> %7, %3
+ %9 = bitcast i16 %u to <16 x i1>
+ %10 = select <16 x i1> %9, <16 x i32> %8, <16 x i32> zeroinitializer
+ %bc = bitcast <16 x i32> %10 to <8 x i64>
+ ret <8 x i64> %bc
+}
+
+define <2 x i64> @combine_var_maskz_ror_epi32_128(i8 zeroext %u, <2 x i64> %a, <2 x i64> %b) {
+; AVX512-LABEL: combine_var_maskz_ror_epi32_128:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovw %edi, %k1
+; AVX512-NEXT: vprorvd %xmm1, %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT: retq
+ %a0 = bitcast <2 x i64> %a to <4 x i32>
+ %b0 = bitcast <2 x i64> %b to <4 x i32>
+  %1 = and <4 x i32> %b0, <i32 31, i32 31, i32 31, i32 31>
+  %2 = sub nsw <4 x i32> <i32 32, i32 32, i32 32, i32 32>, %1
+  %3 = lshr <4 x i32> %a0, %1
+  %4 = icmp ult <4 x i32> %2, <i32 32, i32 32, i32 32, i32 32>
+ %5 = select <4 x i1> %4, <4 x i32> %a0, <4 x i32> zeroinitializer
+ %6 = select <4 x i1> %4, <4 x i32> %2, <4 x i32> zeroinitializer
+ %7 = shl <4 x i32> %5, %6
+ %8 = or <4 x i32> %7, %3
+ %9 = bitcast i8 %u to <8 x i1>
+  %10 = shufflevector <8 x i1> %9, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %11 = select <4 x i1> %10, <4 x i32> %8, <4 x i32> zeroinitializer
+ %bc = bitcast <4 x i32> %11 to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <4 x i64> @combine_var_maskz_ror_epi32_256(i8 zeroext %u, <4 x i64> %a, <4 x i64> %b) {
+; AVX512-LABEL: combine_var_maskz_ror_epi32_256:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovw %edi, %k1
+; AVX512-NEXT: vprorvd %ymm1, %ymm0, %ymm0 {%k1} {z}
+; AVX512-NEXT: retq
+ %a0 = bitcast <4 x i64> %a to <8 x i32>
+ %b0 = bitcast <4 x i64> %b to <8 x i32>
+  %1 = and <8 x i32> %b0, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+  %2 = sub nsw <8 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>, %1
+  %3 = lshr <8 x i32> %a0, %1
+  %4 = icmp ult <8 x i32> %2, <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
+ %5 = select <8 x i1> %4, <8 x i32> %a0, <8 x i32> zeroinitializer
+ %6 = select <8 x i1> %4, <8 x i32> %2, <8 x i32> zeroinitializer
+ %7 = shl <8 x i32> %5, %6
+ %8 = or <8 x i32> %7, %3
+ %9 = bitcast i8 %u to <8 x i1>
+ %10 = select <8 x i1> %9, <8 x i32> %8, <8 x i32> zeroinitializer
+ %bc = bitcast <8 x i32> %10 to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <8 x i64> @combine_var_maskz_ror_epi32_512(i16 zeroext %u, <8 x i64> %a, <8 x i64> %b) {
+; AVX512-LABEL: combine_var_maskz_ror_epi32_512:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovw %edi, %k1
+; AVX512-NEXT: vprorvd %zmm1, %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT: retq
+ %a0 = bitcast <8 x i64> %a to <16 x i32>
+ %b0 = bitcast <8 x i64> %b to <16 x i32>
+  %1 = and <16 x i32> %b0, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+  %2 = sub nsw <16 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>, %1
+  %3 = lshr <16 x i32> %a0, %1
+  %4 = icmp ult <16 x i32> %2, <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
+ %5 = select <16 x i1> %4, <16 x i32> %a0, <16 x i32> zeroinitializer
+ %6 = select <16 x i1> %4, <16 x i32> %2, <16 x i32> zeroinitializer
+ %7 = shl <16 x i32> %5, %6
+ %8 = or <16 x i32> %7, %3
+ %9 = bitcast i16 %u to <16 x i1>
+ %10 = select <16 x i1> %9, <16 x i32> %8, <16 x i32> zeroinitializer
+ %bc = bitcast <16 x i32> %10 to <8 x i64>
+ ret <8 x i64> %bc
+}
+
+define <2 x i64> @combine_var_maskz_rol_epi64_128(i8 zeroext %u, <2 x i64> %a, <2 x i64> %b) {
+; AVX512-LABEL: combine_var_maskz_rol_epi64_128:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vprolvq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: retq
+  %1 = and <2 x i64> %b, <i64 63, i64 63>
+  %2 = sub nsw <2 x i64> <i64 64, i64 64>, %1
+  %3 = shl <2 x i64> %a, %1
+  %4 = icmp ult <2 x i64> %2, <i64 64, i64 64>
+ %5 = select <2 x i1> %4, <2 x i64> %a, <2 x i64> zeroinitializer
+ %6 = select <2 x i1> %4, <2 x i64> %2, <2 x i64> zeroinitializer
+ %7 = lshr <2 x i64> %5, %6
+ %8 = or <2 x i64> %7, %3
+ %9 = bitcast i8 %u to <8 x i1>
+  %10 = shufflevector <8 x i1> %9, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+ %11 = select <2 x i1> %10, <2 x i64> %8, <2 x i64> zeroinitializer
+ ret <2 x i64> %8
+}
+
+define <4 x i64> @combine_var_maskz_rol_epi64_256(i8 zeroext %u, <4 x i64> %a, <4 x i64> %b) {
+; AVX512-LABEL: combine_var_maskz_rol_epi64_256:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vprolvq %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: retq
+  %1 = and <4 x i64> %b, <i64 63, i64 63, i64 63, i64 63>
+  %2 = sub nsw <4 x i64> <i64 64, i64 64, i64 64, i64 64>, %1
+  %3 = shl <4 x i64> %a, %1
+  %4 = icmp ult <4 x i64> %2, <i64 64, i64 64, i64 64, i64 64>
+ %5 = select <4 x i1> %4, <4 x i64> %a, <4 x i64> zeroinitializer
+ %6 = select <4 x i1> %4, <4 x i64> %2, <4 x i64> zeroinitializer
+ %7 = lshr <4 x i64> %5, %6
+ %8 = or <4 x i64> %7, %3
+ %9 = bitcast i8 %u to <8 x i1>
+  %10 = shufflevector <8 x i1> %9, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %11 = select <4 x i1> %10, <4 x i64> %8, <4 x i64> zeroinitializer
+ ret <4 x i64> %8
+}
+
+define <8 x i64> @combine_var_maskz_rol_epi64_512(i8 zeroext %u, <8 x i64> %a, <8 x i64> %b) {
+; AVX512-LABEL: combine_var_maskz_rol_epi64_512:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vprolvq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: retq
+  %1 = and <8 x i64> %b, <i64 63, i64 63, i64 63, i64 63, i64 63, i64 63, i64 63, i64 63>
+  %2 = sub nsw <8 x i64> <i64 64, i64 64, i64 64, i64 64, i64 64, i64 64, i64 64, i64 64>, %1
+  %3 = shl <8 x i64> %a, %1
+  %4 = icmp ult <8 x i64> %2, <i64 64, i64 64, i64 64, i64 64, i64 64, i64 64, i64 64, i64 64>
+ %5 = select <8 x i1> %4, <8 x i64> %a, <8 x i64> zeroinitializer
+ %6 = select <8 x i1> %4, <8 x i64> %2, <8 x i64> zeroinitializer
+ %7 = lshr <8 x i64> %5, %6
+ %8 = or <8 x i64> %7, %3
+ %9 = bitcast i8 %u to <8 x i1>
+ %10 = select <8 x i1> %9, <8 x i64> %8, <8 x i64> zeroinitializer
+ ret <8 x i64> %8
+}
+
+define <2 x i64> @combine_var_maskz_ror_epi64_128(i8 zeroext %u, <2 x i64> %a, <2 x i64> %b) {
+; AVX512-LABEL: combine_var_maskz_ror_epi64_128:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vprorvq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: retq
+  %1 = and <2 x i64> %b, <i64 63, i64 63>
+  %2 = sub nsw <2 x i64> <i64 64, i64 64>, %1
+  %3 = lshr <2 x i64> %a, %1
+  %4 = icmp ult <2 x i64> %2, <i64 64, i64 64>
+ %5 = select <2 x i1> %4, <2 x i64> %a, <2 x i64> zeroinitializer
+ %6 = select <2 x i1> %4, <2 x i64> %2, <2 x i64> zeroinitializer
+ %7 = shl <2 x i64> %5, %6
+ %8 = or <2 x i64> %7, %3
+ %9 = bitcast i8 %u to <8 x i1>
+  %10 = shufflevector <8 x i1> %9, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+ %11 = select <2 x i1> %10, <2 x i64> %8, <2 x i64> zeroinitializer
+ ret <2 x i64> %8
+}
+
+define <4 x i64> @combine_var_maskz_ror_epi64_256(i8 zeroext %u, <4 x i64> %a, <4 x i64> %b) {
+; AVX512-LABEL: combine_var_maskz_ror_epi64_256:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vprorvq %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: retq
+  %1 = and <4 x i64> %b, <i64 63, i64 63, i64 63, i64 63>
+  %2 = sub nsw <4 x i64> <i64 64, i64 64, i64 64, i64 64>, %1
+  %3 = lshr <4 x i64> %a, %1
+  %4 = icmp ult <4 x i64> %2, <i64 64, i64 64, i64 64, i64 64>
+ %5 = select <4 x i1> %4, <4 x i64> %a, <4 x i64> zeroinitializer
+ %6 = select <4 x i1> %4, <4 x i64> %2, <4 x i64> zeroinitializer
+ %7 = shl <4 x i64> %5, %6
+ %8 = or <4 x i64> %7, %3
+ %9 = bitcast i8 %u to <8 x i1>
+  %10 = shufflevector <8 x i1> %9, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %11 = select <4 x i1> %10, <4 x i64> %8, <4 x i64> zeroinitializer
+ ret <4 x i64> %8
+}
+
+define <8 x i64> @combine_var_maskz_ror_epi64_512(i8 zeroext %u, <8 x i64> %a, <8 x i64> %b) {
+; AVX512-LABEL: combine_var_maskz_ror_epi64_512:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vprorvq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: retq
+  %1 = and <8 x i64> %b, <i64 63, i64 63, i64 63, i64 63, i64 63, i64 63, i64 63, i64 63>
+  %2 = sub nsw <8 x i64> <i64 64, i64 64, i64 64, i64 64, i64 64, i64 64, i64 64, i64 64>, %1
+  %3 = lshr <8 x i64> %a, %1
+  %4 = icmp ult <8 x i64> %2, <i64 64, i64 64, i64 64, i64 64, i64 64, i64 64, i64 64, i64 64>
+ %5 = select <8 x i1> %4, <8 x i64> %a, <8 x i64> zeroinitializer
+ %6 = select <8 x i1> %4, <8 x i64> %2, <8 x i64> zeroinitializer
+ %7 = shl <8 x i64> %5, %6
+ %8 = or <8 x i64> %7, %3
+ %9 = bitcast i8 %u to <8 x i1>
+ %10 = select <8 x i1> %9, <8 x i64> %8, <8 x i64> zeroinitializer
+ ret <8 x i64> %8
+}
Index: test/Transforms/InstCombine/X86/x86-rotates.ll
===================================================================
--- /dev/null
+++ test/Transforms/InstCombine/X86/x86-rotates.ll
@@ -0,0 +1,412 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
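+
+; All of the AVX512 masked rotate intrinsics should be expanded into generic
+; shifts, an 'or', and a select on the mask.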
+
+define <4 x i32> @avx512_mask_prol_d_128(<4 x i32> %v, <4 x i32> %src, i8 %mask) {
+; CHECK-LABEL: @avx512_mask_prol_d_128(
+; CHECK-NEXT:    [[TMP1:%.*]] = shl <4 x i32> [[V:%.*]], <i32 5, i32 5, i32 5, i32 5>
+; CHECK-NEXT:    [[TMP2:%.*]] = lshr <4 x i32> [[V]], <i32 27, i32 27, i32 27, i32 27>
+; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP3]], <4 x i32> [[SRC:%.*]]
+; CHECK-NEXT: ret <4 x i32> [[TMP5]]
+;
+ %1 = tail call <4 x i32> @llvm.x86.avx512.mask.prol.d.128(<4 x i32> %v, i32 5, <4 x i32> %src, i8 %mask)
+ ret <4 x i32> %1
+}
+
+define <4 x i32> @avx512_mask_pror_d_128(<4 x i32> %v, <4 x i32> %src, i8 %mask) {
+; CHECK-LABEL: @avx512_mask_pror_d_128(
+; CHECK-NEXT:    [[TMP1:%.*]] = shl <4 x i32> [[V:%.*]], <i32 27, i32 27, i32 27, i32 27>
+; CHECK-NEXT:    [[TMP2:%.*]] = lshr <4 x i32> [[V]], <i32 5, i32 5, i32 5, i32 5>
+; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP3]], <4 x i32> [[SRC:%.*]]
+; CHECK-NEXT: ret <4 x i32> [[TMP5]]
+;
+ %1 = tail call <4 x i32> @llvm.x86.avx512.mask.pror.d.128(<4 x i32> %v, i32 5, <4 x i32> %src, i8 %mask)
+ ret <4 x i32> %1
+}
+
+define <8 x i32> @avx512_mask_prol_d_256(<8 x i32> %v, <8 x i32> %src, i8 %mask) {
+; CHECK-LABEL: @avx512_mask_prol_d_256(
+; CHECK-NEXT:    [[TMP1:%.*]] = shl <8 x i32> [[V:%.*]], <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+; CHECK-NEXT:    [[TMP2:%.*]] = lshr <8 x i32> [[V]], <i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27>
+; CHECK-NEXT: [[TMP3:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP5:%.*]] = select <8 x i1> [[TMP4]], <8 x i32> [[TMP3]], <8 x i32> [[SRC:%.*]]
+; CHECK-NEXT: ret <8 x i32> [[TMP5]]
+;
+ %1 = tail call <8 x i32> @llvm.x86.avx512.mask.prol.d.256(<8 x i32> %v, i32 5, <8 x i32> %src, i8 %mask)
+ ret <8 x i32> %1
+}
+
+define <8 x i32> @avx512_mask_pror_d_256(<8 x i32> %v, <8 x i32> %src, i8 %mask) {
+; CHECK-LABEL: @avx512_mask_pror_d_256(
+; CHECK-NEXT:    [[TMP1:%.*]] = shl <8 x i32> [[V:%.*]], <i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27>
+; CHECK-NEXT:    [[TMP2:%.*]] = lshr <8 x i32> [[V]], <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+; CHECK-NEXT: [[TMP3:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP5:%.*]] = select <8 x i1> [[TMP4]], <8 x i32> [[TMP3]], <8 x i32> [[SRC:%.*]]
+; CHECK-NEXT: ret <8 x i32> [[TMP5]]
+;
+ %1 = tail call <8 x i32> @llvm.x86.avx512.mask.pror.d.256(<8 x i32> %v, i32 5, <8 x i32> %src, i8 %mask)
+ ret <8 x i32> %1
+}
+
+define <16 x i32> @avx512_mask_prol_d_512(<16 x i32> %v, <16 x i32> %src, i16 %mask) {
+; CHECK-LABEL: @avx512_mask_prol_d_512(
+; CHECK-NEXT:    [[TMP1:%.*]] = shl <16 x i32> [[V:%.*]], <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+; CHECK-NEXT:    [[TMP2:%.*]] = lshr <16 x i32> [[V]], <i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27>
+; CHECK-NEXT: [[TMP3:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP5:%.*]] = select <16 x i1> [[TMP4]], <16 x i32> [[TMP3]], <16 x i32> [[SRC:%.*]]
+; CHECK-NEXT: ret <16 x i32> [[TMP5]]
+;
+ %1 = tail call <16 x i32> @llvm.x86.avx512.mask.prol.d.512(<16 x i32> %v, i32 5, <16 x i32> %src, i16 %mask)
+ ret <16 x i32> %1
+}
+
+define <16 x i32> @avx512_mask_pror_d_512(<16 x i32> %v, <16 x i32> %src, i16 %mask) {
+; CHECK-LABEL: @avx512_mask_pror_d_512(
+; CHECK-NEXT:    [[TMP1:%.*]] = shl <16 x i32> [[V:%.*]], <i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27>
+; CHECK-NEXT:    [[TMP2:%.*]] = lshr <16 x i32> [[V]], <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+; CHECK-NEXT: [[TMP3:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP5:%.*]] = select <16 x i1> [[TMP4]], <16 x i32> [[TMP3]], <16 x i32> [[SRC:%.*]]
+; CHECK-NEXT: ret <16 x i32> [[TMP5]]
+;
+ %1 = tail call <16 x i32> @llvm.x86.avx512.mask.pror.d.512(<16 x i32> %v, i32 5, <16 x i32> %src, i16 %mask)
+ ret <16 x i32> %1
+}
+
+define <2 x i64> @avx512_mask_prol_q_128(<2 x i64> %v, <2 x i64> %src, i8 %mask) {
+; CHECK-LABEL: @avx512_mask_prol_q_128(
+; CHECK-NEXT:    [[TMP1:%.*]] = shl <2 x i64> [[V:%.*]], <i64 5, i64 5>
+; CHECK-NEXT:    [[TMP2:%.*]] = lshr <2 x i64> [[V]], <i64 59, i64 59>
+; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP5:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> [[TMP3]], <2 x i64> [[SRC:%.*]]
+; CHECK-NEXT: ret <2 x i64> [[TMP5]]
+;
+ %1 = tail call <2 x i64> @llvm.x86.avx512.mask.prol.q.128(<2 x i64> %v, i32 5, <2 x i64> %src, i8 %mask)
+ ret <2 x i64> %1
+}
+
+define <2 x i64> @avx512_mask_pror_q_128(<2 x i64> %v, <2 x i64> %src, i8 %mask) {
+; CHECK-LABEL: @avx512_mask_pror_q_128(
+; CHECK-NEXT:    [[TMP1:%.*]] = shl <2 x i64> [[V:%.*]], <i64 59, i64 59>
+; CHECK-NEXT:    [[TMP2:%.*]] = lshr <2 x i64> [[V]], <i64 5, i64 5>
+; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP5:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> [[TMP3]], <2 x i64> [[SRC:%.*]]
+; CHECK-NEXT: ret <2 x i64> [[TMP5]]
+;
+ %1 = tail call <2 x i64> @llvm.x86.avx512.mask.pror.q.128(<2 x i64> %v, i32 5, <2 x i64> %src, i8 %mask)
+ ret <2 x i64> %1
+}
+
+define <4 x i64> @avx512_mask_prol_q_256(<4 x i64> %v, <4 x i64> %src, i8 %mask) {
+; CHECK-LABEL: @avx512_mask_prol_q_256(
+; CHECK-NEXT:    [[TMP1:%.*]] = shl <4 x i64> [[V:%.*]], <i64 5, i64 5, i64 5, i64 5>
+; CHECK-NEXT:    [[TMP2:%.*]] = lshr <4 x i64> [[V]], <i64 59, i64 59, i64 59, i64 59>
+; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[TMP3]], <4 x i64> [[SRC:%.*]]
+; CHECK-NEXT: ret <4 x i64> [[TMP5]]
+;
+ %1 = tail call <4 x i64> @llvm.x86.avx512.mask.prol.q.256(<4 x i64> %v, i32 5, <4 x i64> %src, i8 %mask)
+ ret <4 x i64> %1
+}
+
+define <4 x i64> @avx512_mask_pror_q_256(<4 x i64> %v, <4 x i64> %src, i8 %mask) {
+; CHECK-LABEL: @avx512_mask_pror_q_256(
+; CHECK-NEXT:    [[TMP1:%.*]] = shl <4 x i64> [[V:%.*]], <i64 59, i64 59, i64 59, i64 59>
+; CHECK-NEXT:    [[TMP2:%.*]] = lshr <4 x i64> [[V]], <i64 5, i64 5, i64 5, i64 5>
+; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[TMP3]], <4 x i64> [[SRC:%.*]]
+; CHECK-NEXT: ret <4 x i64> [[TMP5]]
+;
+ %1 = tail call <4 x i64> @llvm.x86.avx512.mask.pror.q.256(<4 x i64> %v, i32 5, <4 x i64> %src, i8 %mask)
+ ret <4 x i64> %1
+}
+
+define <8 x i64> @avx512_mask_prol_q_512(<8 x i64> %v, <8 x i64> %src, i8 %mask) {
+; CHECK-LABEL: @avx512_mask_prol_q_512(
+; CHECK-NEXT:    [[TMP1:%.*]] = shl <8 x i64> [[V:%.*]], <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>
+; CHECK-NEXT:    [[TMP2:%.*]] = lshr <8 x i64> [[V]], <i64 59, i64 59, i64 59, i64 59, i64 59, i64 59, i64 59, i64 59>
+; CHECK-NEXT: [[TMP3:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP5:%.*]] = select <8 x i1> [[TMP4]], <8 x i64> [[TMP3]], <8 x i64> [[SRC:%.*]]
+; CHECK-NEXT: ret <8 x i64> [[TMP5]]
+;
+ %1 = tail call <8 x i64> @llvm.x86.avx512.mask.prol.q.512(<8 x i64> %v, i32 5, <8 x i64> %src, i8 %mask)
+ ret <8 x i64> %1
+}
+
+define <8 x i64> @avx512_mask_pror_q_512(<8 x i64> %v, <8 x i64> %src, i8 %mask) {
+; CHECK-LABEL: @avx512_mask_pror_q_512(
+; CHECK-NEXT:    [[TMP1:%.*]] = shl <8 x i64> [[V:%.*]], <i64 59, i64 59, i64 59, i64 59, i64 59, i64 59, i64 59, i64 59>
+; CHECK-NEXT:    [[TMP2:%.*]] = lshr <8 x i64> [[V]], <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>
+; CHECK-NEXT: [[TMP3:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP5:%.*]] = select <8 x i1> [[TMP4]], <8 x i64> [[TMP3]], <8 x i64> [[SRC:%.*]]
+; CHECK-NEXT: ret <8 x i64> [[TMP5]]
+;
+ %1 = tail call <8 x i64> @llvm.x86.avx512.mask.pror.q.512(<8 x i64> %v, i32 5, <8 x i64> %src, i8 %mask)
+ ret <8 x i64> %1
+}
+
+define <4 x i32> @avx512_mask_prolv_d_128(<4 x i32> %v, <4 x i32> %count, <4 x i32> %src, i8 %mask) {
+; CHECK-LABEL: @avx512_mask_prolv_d_128(
+; CHECK-NEXT:    [[TMP1:%.*]] = and <4 x i32> [[COUNT:%.*]], <i32 31, i32 31, i32 31, i32 31>
+; CHECK-NEXT:    [[TMP2:%.*]] = sub nsw <4 x i32> <i32 32, i32 32, i32 32, i32 32>, [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shl <4 x i32> [[V:%.*]], [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ult <4 x i32> [[TMP2]], <i32 32, i32 32, i32 32, i32 32>
+; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[TMP4]], <4 x i32> [[V]], <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[TMP4]], <4 x i32> [[TMP2]], <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = lshr <4 x i32> [[TMP5]], [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = or <4 x i32> [[TMP3]], [[TMP7]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP9]], <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP10:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP8]], <4 x i32> [[SRC:%.*]]
+; CHECK-NEXT: ret <4 x i32> [[TMP10]]
+;
+ %1 = tail call <4 x i32> @llvm.x86.avx512.mask.prolv.d.128(<4 x i32> %v, <4 x i32> %count, <4 x i32> %src, i8 %mask)
+ ret <4 x i32> %1
+}
+
+define <4 x i32> @avx512_mask_prorv_d_128(<4 x i32> %v, <4 x i32> %count, <4 x i32> %src, i8 %mask) {
+; CHECK-LABEL: @avx512_mask_prorv_d_128(
+; CHECK-NEXT:    [[TMP1:%.*]] = and <4 x i32> [[COUNT:%.*]], <i32 31, i32 31, i32 31, i32 31>
+; CHECK-NEXT:    [[TMP2:%.*]] = sub nsw <4 x i32> <i32 32, i32 32, i32 32, i32 32>, [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ult <4 x i32> [[TMP2]], <i32 32, i32 32, i32 32, i32 32>
+; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP3]], <4 x i32> [[V:%.*]], <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[TMP3]], <4 x i32> [[TMP2]], <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = shl <4 x i32> [[TMP4]], [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = lshr <4 x i32> [[V]], [[TMP1]]
+; CHECK-NEXT: [[TMP8:%.*]] = or <4 x i32> [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP9]], <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP10:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP8]], <4 x i32> [[SRC:%.*]]
+; CHECK-NEXT: ret <4 x i32> [[TMP10]]
+;
+ %1 = tail call <4 x i32> @llvm.x86.avx512.mask.prorv.d.128(<4 x i32> %v, <4 x i32> %count, <4 x i32> %src, i8 %mask)
+ ret <4 x i32> %1
+}
+
+define <8 x i32> @avx512_mask_prolv_d_256(<8 x i32> %v, <8 x i32> %count, <8 x i32> %src, i8 %mask) {
+; CHECK-LABEL: @avx512_mask_prolv_d_256(
+; CHECK-NEXT:    [[TMP1:%.*]] = and <8 x i32> [[COUNT:%.*]], <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; CHECK-NEXT:    [[TMP2:%.*]] = sub nsw <8 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>, [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shl <8 x i32> [[V:%.*]], [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ult <8 x i32> [[TMP2]], <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
+; CHECK-NEXT: [[TMP5:%.*]] = select <8 x i1> [[TMP4]], <8 x i32> [[V]], <8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[TMP4]], <8 x i32> [[TMP2]], <8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = lshr <8 x i32> [[TMP5]], [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i32> [[TMP3]], [[TMP7]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[TMP9]], <8 x i32> [[TMP8]], <8 x i32> [[SRC:%.*]]
+; CHECK-NEXT: ret <8 x i32> [[TMP10]]
+;
+ %1 = tail call <8 x i32> @llvm.x86.avx512.mask.prolv.d.256(<8 x i32> %v, <8 x i32> %count, <8 x i32> %src, i8 %mask)
+ ret <8 x i32> %1
+}
+
+define <8 x i32> @avx512_mask_prorv_d_256(<8 x i32> %v, <8 x i32> %count, <8 x i32> %src, i8 %mask) {
+; CHECK-LABEL: @avx512_mask_prorv_d_256(
+; CHECK-NEXT:    [[TMP1:%.*]] = and <8 x i32> [[COUNT:%.*]], <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; CHECK-NEXT:    [[TMP2:%.*]] = sub nsw <8 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>, [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ult <8 x i32> [[TMP2]], <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
+; CHECK-NEXT: [[TMP4:%.*]] = select <8 x i1> [[TMP3]], <8 x i32> [[V:%.*]], <8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = select <8 x i1> [[TMP3]], <8 x i32> [[TMP2]], <8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = shl <8 x i32> [[TMP4]], [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = lshr <8 x i32> [[V]], [[TMP1]]
+; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i32> [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[TMP9]], <8 x i32> [[TMP8]], <8 x i32> [[SRC:%.*]]
+; CHECK-NEXT: ret <8 x i32> [[TMP10]]
+;
+ %1 = tail call <8 x i32> @llvm.x86.avx512.mask.prorv.d.256(<8 x i32> %v, <8 x i32> %count, <8 x i32> %src, i8 %mask)
+ ret <8 x i32> %1
+}
+
+define <16 x i32> @avx512_mask_prolv_d_512(<16 x i32> %v, <16 x i32> %count, <16 x i32> %src, i16 %mask) {
+; CHECK-LABEL: @avx512_mask_prolv_d_512(
+; CHECK-NEXT:    [[TMP1:%.*]] = and <16 x i32> [[COUNT:%.*]], <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; CHECK-NEXT:    [[TMP2:%.*]] = sub nsw <16 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>, [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shl <16 x i32> [[V:%.*]], [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ult <16 x i32> [[TMP2]], <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
+; CHECK-NEXT: [[TMP5:%.*]] = select <16 x i1> [[TMP4]], <16 x i32> [[V]], <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[TMP4]], <16 x i32> [[TMP2]], <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = lshr <16 x i32> [[TMP5]], [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP3]], [[TMP7]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP8]], <16 x i32> [[SRC:%.*]]
+; CHECK-NEXT: ret <16 x i32> [[TMP10]]
+;
+ %1 = tail call <16 x i32> @llvm.x86.avx512.mask.prolv.d.512(<16 x i32> %v, <16 x i32> %count, <16 x i32> %src, i16 %mask)
+ ret <16 x i32> %1
+}
+
+define <16 x i32> @avx512_mask_prorv_d_512(<16 x i32> %v, <16 x i32> %count, <16 x i32> %src, i16 %mask) {
+; CHECK-LABEL: @avx512_mask_prorv_d_512(
+; CHECK-NEXT:    [[TMP1:%.*]] = and <16 x i32> [[COUNT:%.*]], <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; CHECK-NEXT:    [[TMP2:%.*]] = sub nsw <16 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>, [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ult <16 x i32> [[TMP2]], <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
+; CHECK-NEXT: [[TMP4:%.*]] = select <16 x i1> [[TMP3]], <16 x i32> [[V:%.*]], <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = select <16 x i1> [[TMP3]], <16 x i32> [[TMP2]], <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = shl <16 x i32> [[TMP4]], [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = lshr <16 x i32> [[V]], [[TMP1]]
+; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP8]], <16 x i32> [[SRC:%.*]]
+; CHECK-NEXT: ret <16 x i32> [[TMP10]]
+;
+ %1 = tail call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %v, <16 x i32> %count, <16 x i32> %src, i16 %mask)
+ ret <16 x i32> %1
+}
+
+define <2 x i64> @avx512_mask_prolv_q_128(<2 x i64> %v, <2 x i64> %count, <2 x i64> %src, i8 %mask) {
+; CHECK-LABEL: @avx512_mask_prolv_q_128(
+; CHECK-NEXT:    [[TMP1:%.*]] = and <2 x i64> [[COUNT:%.*]], <i64 63, i64 63>
+; CHECK-NEXT:    [[TMP2:%.*]] = sub nsw <2 x i64> <i64 64, i64 64>, [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shl <2 x i64> [[V:%.*]], [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ult <2 x i64> [[TMP2]], <i64 64, i64 64>
+; CHECK-NEXT: [[TMP5:%.*]] = select <2 x i1> [[TMP4]], <2 x i64> [[V]], <2 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = select <2 x i1> [[TMP4]], <2 x i64> [[TMP2]], <2 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = lshr <2 x i64> [[TMP5]], [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = or <2 x i64> [[TMP3]], [[TMP7]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP9]], <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP10:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> [[TMP8]], <2 x i64> [[SRC:%.*]]
+; CHECK-NEXT: ret <2 x i64> [[TMP10]]
+;
+ %1 = tail call <2 x i64> @llvm.x86.avx512.mask.prolv.q.128(<2 x i64> %v, <2 x i64> %count, <2 x i64> %src, i8 %mask)
+ ret <2 x i64> %1
+}
+
+define <2 x i64> @avx512_mask_prorv_q_128(<2 x i64> %v, <2 x i64> %count, <2 x i64> %src, i8 %mask) {
+; CHECK-LABEL: @avx512_mask_prorv_q_128(
+; CHECK-NEXT:    [[TMP1:%.*]] = and <2 x i64> [[COUNT:%.*]], <i64 63, i64 63>
+; CHECK-NEXT:    [[TMP2:%.*]] = sub nsw <2 x i64> <i64 64, i64 64>, [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ult <2 x i64> [[TMP2]], <i64 64, i64 64>
+; CHECK-NEXT: [[TMP4:%.*]] = select <2 x i1> [[TMP3]], <2 x i64> [[V:%.*]], <2 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = select <2 x i1> [[TMP3]], <2 x i64> [[TMP2]], <2 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = shl <2 x i64> [[TMP4]], [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = lshr <2 x i64> [[V]], [[TMP1]]
+; CHECK-NEXT: [[TMP8:%.*]] = or <2 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP9]], <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP10:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> [[TMP8]], <2 x i64> [[SRC:%.*]]
+; CHECK-NEXT: ret <2 x i64> [[TMP10]]
+;
+ %1 = tail call <2 x i64> @llvm.x86.avx512.mask.prorv.q.128(<2 x i64> %v, <2 x i64> %count, <2 x i64> %src, i8 %mask)
+ ret <2 x i64> %1
+}
+
+define <4 x i64> @avx512_mask_prolv_q_256(<4 x i64> %v, <4 x i64> %count, <4 x i64> %src, i8 %mask) {
+; CHECK-LABEL: @avx512_mask_prolv_q_256(
+; CHECK-NEXT:    [[TMP1:%.*]] = and <4 x i64> [[COUNT:%.*]], <i64 63, i64 63, i64 63, i64 63>
+; CHECK-NEXT:    [[TMP2:%.*]] = sub nsw <4 x i64> <i64 64, i64 64, i64 64, i64 64>, [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shl <4 x i64> [[V:%.*]], [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ult <4 x i64> [[TMP2]], <i64 64, i64 64, i64 64, i64 64>
+; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[TMP4]], <4 x i64> [[V]], <4 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[TMP4]], <4 x i64> [[TMP2]], <4 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = lshr <4 x i64> [[TMP5]], [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = or <4 x i64> [[TMP3]], [[TMP7]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP9]], <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP10:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[TMP8]], <4 x i64> [[SRC:%.*]]
+; CHECK-NEXT: ret <4 x i64> [[TMP10]]
+;
+ %1 = tail call <4 x i64> @llvm.x86.avx512.mask.prolv.q.256(<4 x i64> %v, <4 x i64> %count, <4 x i64> %src, i8 %mask)
+ ret <4 x i64> %1
+}
+
+define <4 x i64> @avx512_mask_prorv_q_256(<4 x i64> %v, <4 x i64> %count, <4 x i64> %src, i8 %mask) {
+; CHECK-LABEL: @avx512_mask_prorv_q_256(
+; CHECK-NEXT:    [[TMP1:%.*]] = and <4 x i64> [[COUNT:%.*]], <i64 63, i64 63, i64 63, i64 63>
+; CHECK-NEXT:    [[TMP2:%.*]] = sub nsw <4 x i64> <i64 64, i64 64, i64 64, i64 64>, [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ult <4 x i64> [[TMP2]], <i64 64, i64 64, i64 64, i64 64>
+; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP3]], <4 x i64> [[V:%.*]], <4 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[TMP3]], <4 x i64> [[TMP2]], <4 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = shl <4 x i64> [[TMP4]], [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = lshr <4 x i64> [[V]], [[TMP1]]
+; CHECK-NEXT: [[TMP8:%.*]] = or <4 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP9]], <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP10:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[TMP8]], <4 x i64> [[SRC:%.*]]
+; CHECK-NEXT: ret <4 x i64> [[TMP10]]
+;
+ %1 = tail call <4 x i64> @llvm.x86.avx512.mask.prorv.q.256(<4 x i64> %v, <4 x i64> %count, <4 x i64> %src, i8 %mask)
+ ret <4 x i64> %1
+}
+
+define <8 x i64> @avx512_mask_prolv_q_512(<8 x i64> %v, <8 x i64> %count, <8 x i64> %src, i8 %mask) {
+; CHECK-LABEL: @avx512_mask_prolv_q_512(
+; CHECK-NEXT:    [[TMP1:%.*]] = and <8 x i64> [[COUNT:%.*]], <i64 63, i64 63, i64 63, i64 63, i64 63, i64 63, i64 63, i64 63>
+; CHECK-NEXT:    [[TMP2:%.*]] = sub nsw <8 x i64> <i64 64, i64 64, i64 64, i64 64, i64 64, i64 64, i64 64, i64 64>, [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shl <8 x i64> [[V:%.*]], [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ult <8 x i64> [[TMP2]], <i64 64, i64 64, i64 64, i64 64, i64 64, i64 64, i64 64, i64 64>
+; CHECK-NEXT: [[TMP5:%.*]] = select <8 x i1> [[TMP4]], <8 x i64> [[V]], <8 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[TMP4]], <8 x i64> [[TMP2]], <8 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = lshr <8 x i64> [[TMP5]], [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP3]], [[TMP7]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP8]], <8 x i64> [[SRC:%.*]]
+; CHECK-NEXT: ret <8 x i64> [[TMP10]]
+;
+ %1 = tail call <8 x i64> @llvm.x86.avx512.mask.prolv.q.512(<8 x i64> %v, <8 x i64> %count, <8 x i64> %src, i8 %mask)
+ ret <8 x i64> %1
+}
+
+define <8 x i64> @avx512_mask_prorv_q_512(<8 x i64> %v, <8 x i64> %count, <8 x i64> %src, i8 %mask) {
+; CHECK-LABEL: @avx512_mask_prorv_q_512(
+; CHECK-NEXT:    [[TMP1:%.*]] = and <8 x i64> [[COUNT:%.*]], <i64 63, i64 63, i64 63, i64 63, i64 63, i64 63, i64 63, i64 63>
+; CHECK-NEXT:    [[TMP2:%.*]] = sub nsw <8 x i64> <i64 64, i64 64, i64 64, i64 64, i64 64, i64 64, i64 64, i64 64>, [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ult <8 x i64> [[TMP2]], <i64 64, i64 64, i64 64, i64 64, i64 64, i64 64, i64 64, i64 64>
+; CHECK-NEXT: [[TMP4:%.*]] = select <8 x i1> [[TMP3]], <8 x i64> [[V:%.*]], <8 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = select <8 x i1> [[TMP3]], <8 x i64> [[TMP2]], <8 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = shl <8 x i64> [[TMP4]], [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = lshr <8 x i64> [[V]], [[TMP1]]
+; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP8]], <8 x i64> [[SRC:%.*]]
+; CHECK-NEXT: ret <8 x i64> [[TMP10]]
+;
+ %1 = tail call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %v, <8 x i64> %count, <8 x i64> %src, i8 %mask)
+ ret <8 x i64> %1
+}
+
+
+declare <4 x i32> @llvm.x86.avx512.mask.prol.d.128(<4 x i32>, i32, <4 x i32>, i8)
+declare <4 x i32> @llvm.x86.avx512.mask.pror.d.128(<4 x i32>, i32, <4 x i32>, i8)
+declare <8 x i32> @llvm.x86.avx512.mask.prol.d.256(<8 x i32>, i32, <8 x i32>, i8)
+declare <8 x i32> @llvm.x86.avx512.mask.pror.d.256(<8 x i32>, i32, <8 x i32>, i8)
+declare <16 x i32> @llvm.x86.avx512.mask.prol.d.512(<16 x i32>, i32, <16 x i32>, i16)
+declare <16 x i32> @llvm.x86.avx512.mask.pror.d.512(<16 x i32>, i32, <16 x i32>, i16)
+declare <2 x i64> @llvm.x86.avx512.mask.prol.q.128(<2 x i64>, i32, <2 x i64>, i8)
+declare <2 x i64> @llvm.x86.avx512.mask.pror.q.128(<2 x i64>, i32, <2 x i64>, i8)
+declare <4 x i64> @llvm.x86.avx512.mask.prol.q.256(<4 x i64>, i32, <4 x i64>, i8)
+declare <4 x i64> @llvm.x86.avx512.mask.pror.q.256(<4 x i64>, i32, <4 x i64>, i8)
+declare <8 x i64> @llvm.x86.avx512.mask.prol.q.512(<8 x i64>, i32, <8 x i64>, i8)
+declare <8 x i64> @llvm.x86.avx512.mask.pror.q.512(<8 x i64>, i32, <8 x i64>, i8)
+declare <4 x i32> @llvm.x86.avx512.mask.prolv.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
+declare <4 x i32> @llvm.x86.avx512.mask.prorv.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
+declare <8 x i32> @llvm.x86.avx512.mask.prolv.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
+declare <8 x i32> @llvm.x86.avx512.mask.prorv.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
+declare <16 x i32> @llvm.x86.avx512.mask.prolv.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+declare <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+declare <2 x i64> @llvm.x86.avx512.mask.prolv.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
+declare <2 x i64> @llvm.x86.avx512.mask.prorv.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
+declare <4 x i64> @llvm.x86.avx512.mask.prolv.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
+declare <4 x i64> @llvm.x86.avx512.mask.prorv.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
+declare <8 x i64> @llvm.x86.avx512.mask.prolv.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+declare <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)