Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -1687,6 +1687,8 @@
   setTargetDAGCombine(ISD::SHL);
   setTargetDAGCombine(ISD::SRA);
   setTargetDAGCombine(ISD::SRL);
+  setTargetDAGCombine(ISD::ROTL);
+  setTargetDAGCombine(ISD::ROTR);
   setTargetDAGCombine(ISD::OR);
   setTargetDAGCombine(ISD::AND);
   setTargetDAGCombine(ISD::ADD);
@@ -33436,6 +33438,26 @@
   return SDValue();
 }
 
+static SDValue combineRotate(SDNode *N, SelectionDAG &DAG,
+                             TargetLowering::DAGCombinerInfo &DCI,
+                             const X86Subtarget &Subtarget) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  EVT VT = N->getValueType(0);
+  SDLoc DL(N);
+  unsigned EltSizeInBits = VT.getScalarSizeInBits();
+
+  // fold (rot X, (and Y, EltSizeInBits - 1)) -> (rot X, Y)
+  // The X86 rotate instructions already take the rotate amount modulo the
+  // element size, so explicitly masking Y is redundant.
+  if (N1.getOpcode() == ISD::AND)
+    if (ConstantSDNode *Const = isConstOrConstSplat(N1.getOperand(1)))
+      if (Const->getZExtValue() == (uint64_t)(EltSizeInBits - 1))
+        return DAG.getNode(N->getOpcode(), DL, VT, N0, N1.getOperand(0));
+
+  return SDValue();
+}
+
 static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const X86Subtarget &Subtarget) {
@@ -38645,6 +38667,8 @@
   case ISD::SHL:
   case ISD::SRA:
   case ISD::SRL:            return combineShift(N, DAG, DCI, Subtarget);
+  case ISD::ROTL:
+  case ISD::ROTR:           return combineRotate(N, DAG, DCI, Subtarget);
   case ISD::AND:            return combineAnd(N, DAG, DCI, Subtarget);
   case ISD::OR:             return combineOr(N, DAG, DCI, Subtarget);
   case ISD::XOR:            return combineXor(N, DAG, DCI, Subtarget);
Index: lib/Transforms/InstCombine/InstCombineCalls.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -1206,6 +1206,99 @@
   return nullptr;
 }
 
+static Value *simplifyX86Rotate(const IntrinsicInst &II,
+                                InstCombiner::BuilderTy &Builder) {
+  StringRef Name = Intrinsic::getName(II.getIntrinsicID());
+  if (!Name.consume_front("llvm.x86.avx512.mask.pro"))
+    return nullptr;
+
+  // After the prefix, Name is e.g. "l.d.128" or "rv.q.512": 'l'/'r' selects
+  // rol/ror, a following 'v' means a per-element (variable) rotate amount,
+  // and 'd'/'q' gives the element width.
+  bool IsRotLeft = Name[0] == 'l';
+  bool IsVariable = Name[1] == 'v';
+  bool IsEltDW = IsVariable ? Name[3] == 'd' : Name[2] == 'd';
+
+  Intrinsic::ID SLLID;
+  Intrinsic::ID SRLID;
+  if (Name.endswith(".128")) {
+    if (IsEltDW) {
+      SLLID = IsVariable ? Intrinsic::x86_avx2_psllv_d
+                         : Intrinsic::x86_sse2_pslli_d;
+      SRLID = IsVariable ? Intrinsic::x86_avx2_psrlv_d
+                         : Intrinsic::x86_sse2_psrli_d;
+    } else {
+      SLLID = IsVariable ? Intrinsic::x86_avx2_psllv_q
+                         : Intrinsic::x86_sse2_pslli_q;
+      SRLID = IsVariable ? Intrinsic::x86_avx2_psrlv_q
+                         : Intrinsic::x86_sse2_psrli_q;
+    }
+  } else if (Name.endswith(".256")) {
+    if (IsEltDW) {
+      SLLID = IsVariable ? Intrinsic::x86_avx2_psllv_d_256
+                         : Intrinsic::x86_avx2_pslli_d;
+      SRLID = IsVariable ? Intrinsic::x86_avx2_psrlv_d_256
+                         : Intrinsic::x86_avx2_psrli_d;
+    } else {
+      SLLID = IsVariable ? Intrinsic::x86_avx2_psllv_q_256
+                         : Intrinsic::x86_avx2_pslli_q;
+      SRLID = IsVariable ? Intrinsic::x86_avx2_psrlv_q_256
+                         : Intrinsic::x86_avx2_psrli_q;
+    }
+  } else {
+    if (IsEltDW) {
+      SLLID = IsVariable ? Intrinsic::x86_avx512_psllv_d_512
+                         : Intrinsic::x86_avx512_pslli_d_512;
+      SRLID = IsVariable ? Intrinsic::x86_avx512_psrlv_d_512
+                         : Intrinsic::x86_avx512_psrli_d_512;
+    } else {
+      SLLID = IsVariable ? Intrinsic::x86_avx512_psllv_q_512
+                         : Intrinsic::x86_avx512_pslli_q_512;
+      SRLID = IsVariable ? Intrinsic::x86_avx512_psrlv_q_512
+                         : Intrinsic::x86_avx512_psrli_q_512;
+    }
+  }
+
+  Function *F = II.getCalledFunction();
+  Function *SHL = Intrinsic::getDeclaration(F->getParent(), SLLID);
+  Function *SHR = Intrinsic::getDeclaration(F->getParent(), SRLID);
+  Value *A = II.getArgOperand(0);
+  Value *Count = II.getArgOperand(1);
+  Value *Src = II.getArgOperand(2);
+  Value *Mask = II.getArgOperand(3);
+  Value *Sub = nullptr;
+
+  LLVMContext &C = II.getContext();
+  unsigned EltSize = A->getType()->getScalarSizeInBits();
+  Type *IntType;
+  if (IsVariable)
+    IntType = EltSize == 32 ? Type::getInt32Ty(C) : Type::getInt64Ty(C);
+  else
+    IntType = Type::getInt32Ty(C);
+  Value *Max = ConstantInt::get(IntType, EltSize);
+
+  // Count = Count % Max
+  // Sub = Max - Count
+  if (IsVariable) {
+    unsigned NumElts = Count->getType()->getVectorNumElements();
+    Value *Vec = Builder.CreateVectorSplat(NumElts, Max);
+    Count = Builder.CreateURem(Count, Vec);
+    Sub = Builder.CreateSub(Vec, Count);
+  } else {
+    Count = Builder.CreateURem(Count, Max);
+    Sub = Builder.CreateSub(Max, Count);
+  }
+
+  // rol(A, Count) = or(shl(A, Count % Max), lshr(A, Max - Count % Max));
+  // ror swaps which shift receives Count and which receives Sub.
+  Value *LeftShift = Builder.CreateCall(SHL, {A, IsRotLeft ? Count : Sub});
+  Value *RightShift = Builder.CreateCall(SHR, {A, IsRotLeft ? Sub : Count});
+  Value *Or = Builder.CreateOr(LeftShift, RightShift);
+  return emitX86MaskSelect(Mask, Or, Src, Builder);
+}
+
 static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombiner &IC) {
   assert((II.getIntrinsicID() == Intrinsic::cttz ||
           II.getIntrinsicID() == Intrinsic::ctlz) &&
@@ -2424,6 +2517,34 @@
     break;
   }
 
+  case Intrinsic::x86_avx512_mask_prol_d_128:
+  case Intrinsic::x86_avx512_mask_prol_d_256:
+  case Intrinsic::x86_avx512_mask_prol_d_512:
+  case Intrinsic::x86_avx512_mask_prol_q_128:
+  case Intrinsic::x86_avx512_mask_prol_q_256:
+  case Intrinsic::x86_avx512_mask_prol_q_512:
+  case Intrinsic::x86_avx512_mask_prolv_d_128:
+  case Intrinsic::x86_avx512_mask_prolv_d_256:
+  case Intrinsic::x86_avx512_mask_prolv_d_512:
+  case Intrinsic::x86_avx512_mask_prolv_q_128:
+  case Intrinsic::x86_avx512_mask_prolv_q_256:
+  case Intrinsic::x86_avx512_mask_prolv_q_512:
+  case Intrinsic::x86_avx512_mask_pror_d_128:
+  case Intrinsic::x86_avx512_mask_pror_d_256:
+  case Intrinsic::x86_avx512_mask_pror_d_512:
+  case Intrinsic::x86_avx512_mask_pror_q_128:
+  case Intrinsic::x86_avx512_mask_pror_q_256:
+  case Intrinsic::x86_avx512_mask_pror_q_512:
+  case Intrinsic::x86_avx512_mask_prorv_d_128:
+  case Intrinsic::x86_avx512_mask_prorv_d_256:
+  case Intrinsic::x86_avx512_mask_prorv_d_512:
+  case Intrinsic::x86_avx512_mask_prorv_q_128:
+  case Intrinsic::x86_avx512_mask_prorv_q_256:
+  case Intrinsic::x86_avx512_mask_prorv_q_512:
+    if (Value *V = simplifyX86Rotate(*II, Builder))
+      return replaceInstUsesWith(*II, V);
+    break;
+
   // Constant fold ashr( <Ci vec>, Ci ).
   // Constant fold lshr( <Ci vec>, Ci ).
   // Constant fold shl( <Ci vec>, Ci ).
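
For reference, the shl/lshr/or sequence built by simplifyX86Rotate is the standard shift-based rotate expansion. The scalar C++ sketch below is illustrative only and is not part of the patch; the helper names are hypothetical, and the saturating shifts model the x86 vector shift behaviour where a count of the element width or more yields zero, which is what makes the Count % Max == 0 case come out correctly.

#include <cassert>
#include <cstdint>

// Hypothetical scalar model of the expansion emitted by simplifyX86Rotate for
// one 32-bit lane. x86 vector shifts (psll/psrl and friends) produce zero once
// the shift count reaches the element width, so a count of 32 is safe here
// even though it would be undefined for the C++ '<<'/'>>' operators.
static uint32_t ShiftLeftSat(uint32_t V, uint32_t C) { return C >= 32 ? 0 : V << C; }
static uint32_t ShiftRightSat(uint32_t V, uint32_t C) { return C >= 32 ? 0 : V >> C; }

static uint32_t RotateLeft32(uint32_t A, uint32_t Count) {
  uint32_t C = Count % 32;                          // Count = Count % Max
  uint32_t S = 32 - C;                              // Sub   = Max - Count
  return ShiftLeftSat(A, C) | ShiftRightSat(A, S);  // rol = shl | lshr
}

int main() {
  assert(RotateLeft32(0x80000001u, 1) == 0x00000003u);
  assert(RotateLeft32(0x12345678u, 0) == 0x12345678u);  // Sub == 32, lshr gives 0
  assert(RotateLeft32(0x12345678u, 36) == RotateLeft32(0x12345678u, 4));
  return 0;
}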
Index: test/CodeGen/X86/combine-rotates.ll =================================================================== --- test/CodeGen/X86/combine-rotates.ll +++ test/CodeGen/X86/combine-rotates.ll @@ -57,3 +57,712 @@ %6 = or <4 x i32> %4, %5 ret <4 x i32> %6 } + + +define <2 x i64> @combine_var_rol_epi32_128(<2 x i64> %a, <2 x i64> %b) { +; AVX512-LABEL: combine_var_rol_epi32_128: +; AVX512: # %bb.0: +; AVX512-NEXT: vprolvd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq + %a0 = bitcast <2 x i64> %a to <4 x i32> + %b0 = bitcast <2 x i64> %b to <4 x i32> + %1 = and <4 x i32> %b0, + %2 = sub nsw <4 x i32> , %1 + %3 = shl <4 x i32> %a0, %1 + %4 = icmp ult <4 x i32> %2, + %5 = select <4 x i1> %4, <4 x i32> %a0, <4 x i32> zeroinitializer + %6 = select <4 x i1> %4, <4 x i32> %2, <4 x i32> zeroinitializer + %7 = lshr <4 x i32> %5, %6 + %8 = or <4 x i32> %7, %3 + %bc = bitcast <4 x i32> %8 to <2 x i64> + ret <2 x i64> %bc +} + +define <4 x i64> @combine_var_rol_epi32_256(<4 x i64> %a, <4 x i64> %b) { +; AVX512-LABEL: combine_var_rol_epi32_256: +; AVX512: # %bb.0: +; AVX512-NEXT: vprolvd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: retq + %a0 = bitcast <4 x i64> %a to <8 x i32> + %b0 = bitcast <4 x i64> %b to <8 x i32> + %1 = and <8 x i32> %b0, + %2 = sub nsw <8 x i32> , %1 + %3 = shl <8 x i32> %a0, %1 + %4 = icmp ult <8 x i32> %2, + %5 = select <8 x i1> %4, <8 x i32> %a0, <8 x i32> zeroinitializer + %6 = select <8 x i1> %4, <8 x i32> %2, <8 x i32> zeroinitializer + %7 = lshr <8 x i32> %5, %6 + %8 = or <8 x i32> %7, %3 + %bc = bitcast <8 x i32> %8 to <4 x i64> + ret <4 x i64> %bc +} + +define <8 x i64> @combine_var_rol_epi32_512(<8 x i64> %a, <8 x i64> %b) { +; AVX512-LABEL: combine_var_rol_epi32_512: +; AVX512: # %bb.0: +; AVX512-NEXT: vprolvd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %a0 = bitcast <8 x i64> %a to <16 x i32> + %b0 = bitcast <8 x i64> %b to <16 x i32> + %1 = and <16 x i32> %b0, + %2 = sub nsw <16 x i32> , %1 + %3 = shl <16 x i32> %a0, %1 + %4 = icmp ult <16 x i32> %2, + %5 = select <16 x i1> %4, <16 x i32> %a0, <16 x i32> zeroinitializer + %6 = select <16 x i1> %4, <16 x i32> %2, <16 x i32> zeroinitializer + %7 = lshr <16 x i32> %5, %6 + %8 = or <16 x i32> %7, %3 + %bc = bitcast <16 x i32> %8 to <8 x i64> + ret <8 x i64> %bc +} + +define <2 x i64> @combine_var_ror_epi32_128(<2 x i64> %a, <2 x i64> %b) { +; AVX512-LABEL: combine_var_ror_epi32_128: +; AVX512: # %bb.0: +; AVX512-NEXT: vprorvd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq + %a0 = bitcast <2 x i64> %a to <4 x i32> + %b0 = bitcast <2 x i64> %b to <4 x i32> + %1 = and <4 x i32> %b0, + %2 = sub nsw <4 x i32> , %1 + %3 = lshr <4 x i32> %a0, %1 + %4 = icmp ult <4 x i32> %2, + %5 = select <4 x i1> %4, <4 x i32> %a0, <4 x i32> zeroinitializer + %6 = select <4 x i1> %4, <4 x i32> %2, <4 x i32> zeroinitializer + %7 = shl <4 x i32> %5, %6 + %8 = or <4 x i32> %7, %3 + %bc = bitcast <4 x i32> %8 to <2 x i64> + ret <2 x i64> %bc +} + +define <4 x i64> @combine_var_ror_epi32_256(<4 x i64> %a, <4 x i64> %b) { +; AVX512-LABEL: combine_var_ror_epi32_256: +; AVX512: # %bb.0: +; AVX512-NEXT: vprorvd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: retq + %a0 = bitcast <4 x i64> %a to <8 x i32> + %b0 = bitcast <4 x i64> %b to <8 x i32> + %1 = and <8 x i32> %b0, + %2 = sub nsw <8 x i32> , %1 + %3 = lshr <8 x i32> %a0, %1 + %4 = icmp ult <8 x i32> %2, + %5 = select <8 x i1> %4, <8 x i32> %a0, <8 x i32> zeroinitializer + %6 = select <8 x i1> %4, <8 x i32> %2, <8 x i32> zeroinitializer + %7 = shl <8 x i32> %5, %6 + %8 = or <8 x i32> %7, %3 + %bc = bitcast <8 x i32> %8 to <4 x i64> 
+ ret <4 x i64> %bc +} + +define <8 x i64> @combine_var_ror_epi32_512(<8 x i64> %a, <8 x i64> %b) { +; AVX512-LABEL: combine_var_ror_epi32_512: +; AVX512: # %bb.0: +; AVX512-NEXT: vprorvd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %a0 = bitcast <8 x i64> %a to <16 x i32> + %b0 = bitcast <8 x i64> %b to <16 x i32> + %1 = and <16 x i32> %b0, + %2 = sub nsw <16 x i32> , %1 + %3 = lshr <16 x i32> %a0, %1 + %4 = icmp ult <16 x i32> %2, + %5 = select <16 x i1> %4, <16 x i32> %a0, <16 x i32> zeroinitializer + %6 = select <16 x i1> %4, <16 x i32> %2, <16 x i32> zeroinitializer + %7 = shl <16 x i32> %5, %6 + %8 = or <16 x i32> %7, %3 + %bc = bitcast <16 x i32> %8 to <8 x i64> + ret <8 x i64> %bc +} + +define <2 x i64> @combine_var_rol_epi64_128(<2 x i64> %a, <2 x i64> %b) { +; AVX512-LABEL: combine_var_rol_epi64_128: +; AVX512: # %bb.0: +; AVX512-NEXT: vprolvq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq + %1 = and <2 x i64> %b, + %2 = sub nsw <2 x i64> , %1 + %3 = shl <2 x i64> %a, %1 + %4 = icmp ult <2 x i64> %2, + %5 = select <2 x i1> %4, <2 x i64> %a, <2 x i64> zeroinitializer + %6 = select <2 x i1> %4, <2 x i64> %2, <2 x i64> zeroinitializer + %7 = lshr <2 x i64> %5, %6 + %8 = or <2 x i64> %7, %3 + ret <2 x i64> %8 +} + +define <4 x i64> @combine_var_rol_epi64_256(<4 x i64> %a, <4 x i64> %b) { +; AVX512-LABEL: combine_var_rol_epi64_256: +; AVX512: # %bb.0: +; AVX512-NEXT: vprolvq %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: retq + %1 = and <4 x i64> %b, + %2 = sub nsw <4 x i64> , %1 + %3 = shl <4 x i64> %a, %1 + %4 = icmp ult <4 x i64> %2, + %5 = select <4 x i1> %4, <4 x i64> %a, <4 x i64> zeroinitializer + %6 = select <4 x i1> %4, <4 x i64> %2, <4 x i64> zeroinitializer + %7 = lshr <4 x i64> %5, %6 + %8 = or <4 x i64> %7, %3 + ret <4 x i64> %8 +} + +define <8 x i64> @combine_var_rol_epi64_512(<8 x i64> %a, <8 x i64> %b) { +; AVX512-LABEL: combine_var_rol_epi64_512: +; AVX512: # %bb.0: +; AVX512-NEXT: vprolvq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %1 = and <8 x i64> %b, + %2 = sub nsw <8 x i64> , %1 + %3 = shl <8 x i64> %a, %1 + %4 = icmp ult <8 x i64> %2, + %5 = select <8 x i1> %4, <8 x i64> %a, <8 x i64> zeroinitializer + %6 = select <8 x i1> %4, <8 x i64> %2, <8 x i64> zeroinitializer + %7 = lshr <8 x i64> %5, %6 + %8 = or <8 x i64> %7, %3 + ret <8 x i64> %8 +} + +define <2 x i64> @combine_var_ror_epi64_128(<2 x i64> %a, <2 x i64> %b) { +; AVX512-LABEL: combine_var_ror_epi64_128: +; AVX512: # %bb.0: +; AVX512-NEXT: vprorvq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq + %1 = and <2 x i64> %b, + %2 = sub nsw <2 x i64> , %1 + %3 = lshr <2 x i64> %a, %1 + %4 = icmp ult <2 x i64> %2, + %5 = select <2 x i1> %4, <2 x i64> %a, <2 x i64> zeroinitializer + %6 = select <2 x i1> %4, <2 x i64> %2, <2 x i64> zeroinitializer + %7 = shl <2 x i64> %5, %6 + %8 = or <2 x i64> %7, %3 + ret <2 x i64> %8 +} + +define <4 x i64> @combine_var_ror_epi64_256(<4 x i64> %a, <4 x i64> %b) { +; AVX512-LABEL: combine_var_ror_epi64_256: +; AVX512: # %bb.0: +; AVX512-NEXT: vprorvq %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: retq + %1 = and <4 x i64> %b, + %2 = sub nsw <4 x i64> , %1 + %3 = lshr <4 x i64> %a, %1 + %4 = icmp ult <4 x i64> %2, + %5 = select <4 x i1> %4, <4 x i64> %a, <4 x i64> zeroinitializer + %6 = select <4 x i1> %4, <4 x i64> %2, <4 x i64> zeroinitializer + %7 = shl <4 x i64> %5, %6 + %8 = or <4 x i64> %7, %3 + ret <4 x i64> %8 +} + +define <8 x i64> @combine_var_ror_epi64_512(<8 x i64> %a, <8 x i64> %b) { +; AVX512-LABEL: combine_var_ror_epi64_512: +; AVX512: # %bb.0: +; AVX512-NEXT: vprorvq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: 
retq + %1 = and <8 x i64> %b, + %2 = sub nsw <8 x i64> , %1 + %3 = lshr <8 x i64> %a, %1 + %4 = icmp ult <8 x i64> %2, + %5 = select <8 x i1> %4, <8 x i64> %a, <8 x i64> zeroinitializer + %6 = select <8 x i1> %4, <8 x i64> %2, <8 x i64> zeroinitializer + %7 = shl <8 x i64> %5, %6 + %8 = or <8 x i64> %7, %3 + ret <8 x i64> %8 +} + +define <2 x i64> @combine_var_mask_rol_epi32_128(<2 x i64> %w, i8 zeroext %u, <2 x i64> %a, <2 x i64> %b) { +; AVX512-LABEL: combine_var_mask_rol_epi32_128: +; AVX512: # %bb.0: +; AVX512-NEXT: kmovw %edi, %k1 +; AVX512-NEXT: vprolvd %xmm2, %xmm1, %xmm0 {%k1} +; AVX512-NEXT: retq + %a0 = bitcast <2 x i64> %a to <4 x i32> + %b0 = bitcast <2 x i64> %b to <4 x i32> + %w0 = bitcast <2 x i64> %w to <4 x i32> + %1 = and <4 x i32> %b0, + %2 = sub nsw <4 x i32> , %1 + %3 = shl <4 x i32> %a0, %1 + %4 = icmp ult <4 x i32> %2, + %5 = select <4 x i1> %4, <4 x i32> %a0, <4 x i32> zeroinitializer + %6 = select <4 x i1> %4, <4 x i32> %2, <4 x i32> zeroinitializer + %7 = lshr <4 x i32> %5, %6 + %8 = or <4 x i32> %7, %3 + %9 = bitcast i8 %u to <8 x i1> + %10 = shufflevector <8 x i1> %9, <8 x i1> undef, <4 x i32> + %11 = select <4 x i1> %10, <4 x i32> %8, <4 x i32> %w0 + %bc = bitcast <4 x i32> %11 to <2 x i64> + ret <2 x i64> %bc +} + +define <4 x i64> @combine_var_mask_rol_epi32_256(<4 x i64> %w, i8 zeroext %u, <4 x i64> %a, <4 x i64> %b) { +; AVX512-LABEL: combine_var_mask_rol_epi32_256: +; AVX512: # %bb.0: +; AVX512-NEXT: kmovw %edi, %k1 +; AVX512-NEXT: vprolvd %ymm2, %ymm1, %ymm0 {%k1} +; AVX512-NEXT: retq + %a0 = bitcast <4 x i64> %a to <8 x i32> + %b0 = bitcast <4 x i64> %b to <8 x i32> + %w0 = bitcast <4 x i64> %w to <8 x i32> + %1 = and <8 x i32> %b0, + %2 = sub nsw <8 x i32> , %1 + %3 = shl <8 x i32> %a0, %1 + %4 = icmp ult <8 x i32> %2, + %5 = select <8 x i1> %4, <8 x i32> %a0, <8 x i32> zeroinitializer + %6 = select <8 x i1> %4, <8 x i32> %2, <8 x i32> zeroinitializer + %7 = lshr <8 x i32> %5, %6 + %8 = or <8 x i32> %7, %3 + %9 = bitcast i8 %u to <8 x i1> + %10 = select <8 x i1> %9, <8 x i32> %8, <8 x i32> %w0 + %bc = bitcast <8 x i32> %10 to <4 x i64> + ret <4 x i64> %bc +} + +define <8 x i64> @combine_var_mask_rol_epi32_512(<8 x i64> %w, i16 zeroext %u, <8 x i64> %a, <8 x i64> %b) { +; AVX512-LABEL: combine_var_mask_rol_epi32_512: +; AVX512: # %bb.0: +; AVX512-NEXT: kmovw %edi, %k1 +; AVX512-NEXT: vprolvd %zmm2, %zmm1, %zmm0 {%k1} +; AVX512-NEXT: retq + %a0 = bitcast <8 x i64> %a to <16 x i32> + %b0 = bitcast <8 x i64> %b to <16 x i32> + %w0 = bitcast <8 x i64> %w to <16 x i32> + %1 = and <16 x i32> %b0, + %2 = sub nsw <16 x i32> , %1 + %3 = shl <16 x i32> %a0, %1 + %4 = icmp ult <16 x i32> %2, + %5 = select <16 x i1> %4, <16 x i32> %a0, <16 x i32> zeroinitializer + %6 = select <16 x i1> %4, <16 x i32> %2, <16 x i32> zeroinitializer + %7 = lshr <16 x i32> %5, %6 + %8 = or <16 x i32> %7, %3 + %9 = bitcast i16 %u to <16 x i1> + %10 = select <16 x i1> %9, <16 x i32> %8, <16 x i32> %w0 + %bc = bitcast <16 x i32> %10 to <8 x i64> + ret <8 x i64> %bc +} + +define <2 x i64> @combine_var_mask_ror_epi32_128(<2 x i64> %w, i8 zeroext %u, <2 x i64> %a, <2 x i64> %b) { +; AVX512-LABEL: combine_var_mask_ror_epi32_128: +; AVX512: # %bb.0: +; AVX512-NEXT: kmovw %edi, %k1 +; AVX512-NEXT: vprorvd %xmm2, %xmm1, %xmm0 {%k1} +; AVX512-NEXT: retq + %a0 = bitcast <2 x i64> %a to <4 x i32> + %b0 = bitcast <2 x i64> %b to <4 x i32> + %w0 = bitcast <2 x i64> %w to <4 x i32> + %1 = and <4 x i32> %b0, + %2 = sub nsw <4 x i32> , %1 + %3 = lshr <4 x i32> %a0, %1 + %4 = icmp ult <4 x i32> %2, + %5 
= select <4 x i1> %4, <4 x i32> %a0, <4 x i32> zeroinitializer + %6 = select <4 x i1> %4, <4 x i32> %2, <4 x i32> zeroinitializer + %7 = shl <4 x i32> %5, %6 + %8 = or <4 x i32> %7, %3 + %9 = bitcast i8 %u to <8 x i1> + %10 = shufflevector <8 x i1> %9, <8 x i1> undef, <4 x i32> + %11 = select <4 x i1> %10, <4 x i32> %8, <4 x i32> %w0 + %bc = bitcast <4 x i32> %11 to <2 x i64> + ret <2 x i64> %bc +} + +define <4 x i64> @combine_var_mask_ror_epi32_256(<4 x i64> %w, i8 zeroext %u, <4 x i64> %a, <4 x i64> %b) { +; AVX512-LABEL: combine_var_mask_ror_epi32_256: +; AVX512: # %bb.0: +; AVX512-NEXT: kmovw %edi, %k1 +; AVX512-NEXT: vprorvd %ymm2, %ymm1, %ymm0 {%k1} +; AVX512-NEXT: retq + %a0 = bitcast <4 x i64> %a to <8 x i32> + %b0 = bitcast <4 x i64> %b to <8 x i32> + %w0 = bitcast <4 x i64> %w to <8 x i32> + %1 = and <8 x i32> %b0, + %2 = sub nsw <8 x i32> , %1 + %3 = lshr <8 x i32> %a0, %1 + %4 = icmp ult <8 x i32> %2, + %5 = select <8 x i1> %4, <8 x i32> %a0, <8 x i32> zeroinitializer + %6 = select <8 x i1> %4, <8 x i32> %2, <8 x i32> zeroinitializer + %7 = shl <8 x i32> %5, %6 + %8 = or <8 x i32> %7, %3 + %9 = bitcast i8 %u to <8 x i1> + %10 = select <8 x i1> %9, <8 x i32> %8, <8 x i32> %w0 + %bc = bitcast <8 x i32> %10 to <4 x i64> + ret <4 x i64> %bc +} + +define <8 x i64> @combine_var_mask_ror_epi32_512(<8 x i64> %w, i16 zeroext %u, <8 x i64> %a, <8 x i64> %b) { +; AVX512-LABEL: combine_var_mask_ror_epi32_512: +; AVX512: # %bb.0: +; AVX512-NEXT: kmovw %edi, %k1 +; AVX512-NEXT: vprorvd %zmm2, %zmm1, %zmm0 {%k1} +; AVX512-NEXT: retq + %a0 = bitcast <8 x i64> %a to <16 x i32> + %b0 = bitcast <8 x i64> %b to <16 x i32> + %w0 = bitcast <8 x i64> %w to <16 x i32> + %1 = and <16 x i32> %b0, + %2 = sub nsw <16 x i32> , %1 + %3 = lshr <16 x i32> %a0, %1 + %4 = icmp ult <16 x i32> %2, + %5 = select <16 x i1> %4, <16 x i32> %a0, <16 x i32> zeroinitializer + %6 = select <16 x i1> %4, <16 x i32> %2, <16 x i32> zeroinitializer + %7 = shl <16 x i32> %5, %6 + %8 = or <16 x i32> %7, %3 + %9 = bitcast i16 %u to <16 x i1> + %10 = select <16 x i1> %9, <16 x i32> %8, <16 x i32> %w0 + %bc = bitcast <16 x i32> %10 to <8 x i64> + ret <8 x i64> %bc +} + +define <2 x i64> @combine_var_mask_rol_epi64_128(<2 x i64> %w, i8 zeroext %u, <2 x i64> %a, <2 x i64> %b) { +; AVX512-LABEL: combine_var_mask_rol_epi64_128: +; AVX512: # %bb.0: +; AVX512-NEXT: vprolvq %xmm2, %xmm1, %xmm0 +; AVX512-NEXT: retq + %1 = and <2 x i64> %b, + %2 = sub nsw <2 x i64> , %1 + %3 = shl <2 x i64> %a, %1 + %4 = icmp ult <2 x i64> %2, + %5 = select <2 x i1> %4, <2 x i64> %a, <2 x i64> zeroinitializer + %6 = select <2 x i1> %4, <2 x i64> %2, <2 x i64> zeroinitializer + %7 = lshr <2 x i64> %5, %6 + %8 = or <2 x i64> %7, %3 + %9 = bitcast i8 %u to <8 x i1> + %10 = shufflevector <8 x i1> %9, <8 x i1> undef, <2 x i32> + %11 = select <2 x i1> %10, <2 x i64> %8, <2 x i64> %w + ret <2 x i64> %8 +} + +define <4 x i64> @combine_var_mask_rol_epi64_256(<4 x i64> %w, i8 zeroext %u, <4 x i64> %a, <4 x i64> %b) { +; AVX512-LABEL: combine_var_mask_rol_epi64_256: +; AVX512: # %bb.0: +; AVX512-NEXT: vprolvq %ymm2, %ymm1, %ymm0 +; AVX512-NEXT: retq + %1 = and <4 x i64> %b, + %2 = sub nsw <4 x i64> , %1 + %3 = shl <4 x i64> %a, %1 + %4 = icmp ult <4 x i64> %2, + %5 = select <4 x i1> %4, <4 x i64> %a, <4 x i64> zeroinitializer + %6 = select <4 x i1> %4, <4 x i64> %2, <4 x i64> zeroinitializer + %7 = lshr <4 x i64> %5, %6 + %8 = or <4 x i64> %7, %3 + %9 = bitcast i8 %u to <8 x i1> + %10 = shufflevector <8 x i1> %9, <8 x i1> undef, <4 x i32> + %11 = select <4 x i1> %10, 
<4 x i64> %8, <4 x i64> %w + ret <4 x i64> %8 +} + +define <8 x i64> @combine_var_mask_rol_epi64_512(<8 x i64> %w, i8 zeroext %u, <8 x i64> %a, <8 x i64> %b) { +; AVX512-LABEL: combine_var_mask_rol_epi64_512: +; AVX512: # %bb.0: +; AVX512-NEXT: vprolvq %zmm2, %zmm1, %zmm0 +; AVX512-NEXT: retq + %1 = and <8 x i64> %b, + %2 = sub nsw <8 x i64> , %1 + %3 = shl <8 x i64> %a, %1 + %4 = icmp ult <8 x i64> %2, + %5 = select <8 x i1> %4, <8 x i64> %a, <8 x i64> zeroinitializer + %6 = select <8 x i1> %4, <8 x i64> %2, <8 x i64> zeroinitializer + %7 = lshr <8 x i64> %5, %6 + %8 = or <8 x i64> %7, %3 + %9 = bitcast i8 %u to <8 x i1> + %10 = select <8 x i1> %9, <8 x i64> %8, <8 x i64> %w + ret <8 x i64> %8 +} + +define <2 x i64> @combine_var_mask_ror_epi64_128(<2 x i64> %w, i8 zeroext %u, <2 x i64> %a, <2 x i64> %b) { +; AVX512-LABEL: combine_var_mask_ror_epi64_128: +; AVX512: # %bb.0: +; AVX512-NEXT: vprorvq %xmm2, %xmm1, %xmm0 +; AVX512-NEXT: retq + %1 = and <2 x i64> %b, + %2 = sub nsw <2 x i64> , %1 + %3 = lshr <2 x i64> %a, %1 + %4 = icmp ult <2 x i64> %2, + %5 = select <2 x i1> %4, <2 x i64> %a, <2 x i64> zeroinitializer + %6 = select <2 x i1> %4, <2 x i64> %2, <2 x i64> zeroinitializer + %7 = shl <2 x i64> %5, %6 + %8 = or <2 x i64> %7, %3 + %9 = bitcast i8 %u to <8 x i1> + %10 = shufflevector <8 x i1> %9, <8 x i1> undef, <2 x i32> + %11 = select <2 x i1> %10, <2 x i64> %8, <2 x i64> %w + ret <2 x i64> %8 +} + +define <4 x i64> @combine_var_mask_ror_epi64_256(<4 x i64> %w, i8 zeroext %u, <4 x i64> %a, <4 x i64> %b) { +; AVX512-LABEL: combine_var_mask_ror_epi64_256: +; AVX512: # %bb.0: +; AVX512-NEXT: vprorvq %ymm2, %ymm1, %ymm0 +; AVX512-NEXT: retq + %1 = and <4 x i64> %b, + %2 = sub nsw <4 x i64> , %1 + %3 = lshr <4 x i64> %a, %1 + %4 = icmp ult <4 x i64> %2, + %5 = select <4 x i1> %4, <4 x i64> %a, <4 x i64> zeroinitializer + %6 = select <4 x i1> %4, <4 x i64> %2, <4 x i64> zeroinitializer + %7 = shl <4 x i64> %5, %6 + %8 = or <4 x i64> %7, %3 + %9 = bitcast i8 %u to <8 x i1> + %10 = shufflevector <8 x i1> %9, <8 x i1> undef, <4 x i32> + %11 = select <4 x i1> %10, <4 x i64> %8, <4 x i64> %w + ret <4 x i64> %8 +} + +define <8 x i64> @combine_var_mask_ror_epi64_512(<8 x i64> %w, i8 zeroext %u, <8 x i64> %a, <8 x i64> %b) { +; AVX512-LABEL: combine_var_mask_ror_epi64_512: +; AVX512: # %bb.0: +; AVX512-NEXT: vprorvq %zmm2, %zmm1, %zmm0 +; AVX512-NEXT: retq + %1 = and <8 x i64> %b, + %2 = sub nsw <8 x i64> , %1 + %3 = lshr <8 x i64> %a, %1 + %4 = icmp ult <8 x i64> %2, + %5 = select <8 x i1> %4, <8 x i64> %a, <8 x i64> zeroinitializer + %6 = select <8 x i1> %4, <8 x i64> %2, <8 x i64> zeroinitializer + %7 = shl <8 x i64> %5, %6 + %8 = or <8 x i64> %7, %3 + %9 = bitcast i8 %u to <8 x i1> + %10 = select <8 x i1> %9, <8 x i64> %8, <8 x i64> %w + ret <8 x i64> %8 +} + +define <2 x i64> @combine_var_maskz_rol_epi32_128(i8 zeroext %u, <2 x i64> %a, <2 x i64> %b) { +; AVX512-LABEL: combine_var_maskz_rol_epi32_128: +; AVX512: # %bb.0: +; AVX512-NEXT: kmovw %edi, %k1 +; AVX512-NEXT: vprolvd %xmm1, %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT: retq + %a0 = bitcast <2 x i64> %a to <4 x i32> + %b0 = bitcast <2 x i64> %b to <4 x i32> + %1 = and <4 x i32> %b0, + %2 = sub nsw <4 x i32> , %1 + %3 = shl <4 x i32> %a0, %1 + %4 = icmp ult <4 x i32> %2, + %5 = select <4 x i1> %4, <4 x i32> %a0, <4 x i32> zeroinitializer + %6 = select <4 x i1> %4, <4 x i32> %2, <4 x i32> zeroinitializer + %7 = lshr <4 x i32> %5, %6 + %8 = or <4 x i32> %7, %3 + %9 = bitcast i8 %u to <8 x i1> + %10 = shufflevector <8 x i1> %9, <8 x i1> undef, 
<4 x i32> + %11 = select <4 x i1> %10, <4 x i32> %8, <4 x i32> zeroinitializer + %bc = bitcast <4 x i32> %11 to <2 x i64> + ret <2 x i64> %bc +} + +define <4 x i64> @combine_var_maskz_rol_epi32_256(i8 zeroext %u, <4 x i64> %a, <4 x i64> %b) { +; AVX512-LABEL: combine_var_maskz_rol_epi32_256: +; AVX512: # %bb.0: +; AVX512-NEXT: kmovw %edi, %k1 +; AVX512-NEXT: vprolvd %ymm1, %ymm0, %ymm0 {%k1} {z} +; AVX512-NEXT: retq + %a0 = bitcast <4 x i64> %a to <8 x i32> + %b0 = bitcast <4 x i64> %b to <8 x i32> + %1 = and <8 x i32> %b0, + %2 = sub nsw <8 x i32> , %1 + %3 = shl <8 x i32> %a0, %1 + %4 = icmp ult <8 x i32> %2, + %5 = select <8 x i1> %4, <8 x i32> %a0, <8 x i32> zeroinitializer + %6 = select <8 x i1> %4, <8 x i32> %2, <8 x i32> zeroinitializer + %7 = lshr <8 x i32> %5, %6 + %8 = or <8 x i32> %7, %3 + %9 = bitcast i8 %u to <8 x i1> + %10 = select <8 x i1> %9, <8 x i32> %8, <8 x i32> zeroinitializer + %bc = bitcast <8 x i32> %10 to <4 x i64> + ret <4 x i64> %bc +} + +define <8 x i64> @combine_var_maskz_rol_epi32_512(i16 zeroext %u, <8 x i64> %a, <8 x i64> %b) { +; AVX512-LABEL: combine_var_maskz_rol_epi32_512: +; AVX512: # %bb.0: +; AVX512-NEXT: kmovw %edi, %k1 +; AVX512-NEXT: vprolvd %zmm1, %zmm0, %zmm0 {%k1} {z} +; AVX512-NEXT: retq + %a0 = bitcast <8 x i64> %a to <16 x i32> + %b0 = bitcast <8 x i64> %b to <16 x i32> + %1 = and <16 x i32> %b0, + %2 = sub nsw <16 x i32> , %1 + %3 = shl <16 x i32> %a0, %1 + %4 = icmp ult <16 x i32> %2, + %5 = select <16 x i1> %4, <16 x i32> %a0, <16 x i32> zeroinitializer + %6 = select <16 x i1> %4, <16 x i32> %2, <16 x i32> zeroinitializer + %7 = lshr <16 x i32> %5, %6 + %8 = or <16 x i32> %7, %3 + %9 = bitcast i16 %u to <16 x i1> + %10 = select <16 x i1> %9, <16 x i32> %8, <16 x i32> zeroinitializer + %bc = bitcast <16 x i32> %10 to <8 x i64> + ret <8 x i64> %bc +} + +define <2 x i64> @combine_var_maskz_ror_epi32_128(i8 zeroext %u, <2 x i64> %a, <2 x i64> %b) { +; AVX512-LABEL: combine_var_maskz_ror_epi32_128: +; AVX512: # %bb.0: +; AVX512-NEXT: kmovw %edi, %k1 +; AVX512-NEXT: vprorvd %xmm1, %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT: retq + %a0 = bitcast <2 x i64> %a to <4 x i32> + %b0 = bitcast <2 x i64> %b to <4 x i32> + %1 = and <4 x i32> %b0, + %2 = sub nsw <4 x i32> , %1 + %3 = lshr <4 x i32> %a0, %1 + %4 = icmp ult <4 x i32> %2, + %5 = select <4 x i1> %4, <4 x i32> %a0, <4 x i32> zeroinitializer + %6 = select <4 x i1> %4, <4 x i32> %2, <4 x i32> zeroinitializer + %7 = shl <4 x i32> %5, %6 + %8 = or <4 x i32> %7, %3 + %9 = bitcast i8 %u to <8 x i1> + %10 = shufflevector <8 x i1> %9, <8 x i1> undef, <4 x i32> + %11 = select <4 x i1> %10, <4 x i32> %8, <4 x i32> zeroinitializer + %bc = bitcast <4 x i32> %11 to <2 x i64> + ret <2 x i64> %bc +} + +define <4 x i64> @combine_var_maskz_ror_epi32_256(i8 zeroext %u, <4 x i64> %a, <4 x i64> %b) { +; AVX512-LABEL: combine_var_maskz_ror_epi32_256: +; AVX512: # %bb.0: +; AVX512-NEXT: kmovw %edi, %k1 +; AVX512-NEXT: vprorvd %ymm1, %ymm0, %ymm0 {%k1} {z} +; AVX512-NEXT: retq + %a0 = bitcast <4 x i64> %a to <8 x i32> + %b0 = bitcast <4 x i64> %b to <8 x i32> + %1 = and <8 x i32> %b0, + %2 = sub nsw <8 x i32> , %1 + %3 = lshr <8 x i32> %a0, %1 + %4 = icmp ult <8 x i32> %2, + %5 = select <8 x i1> %4, <8 x i32> %a0, <8 x i32> zeroinitializer + %6 = select <8 x i1> %4, <8 x i32> %2, <8 x i32> zeroinitializer + %7 = shl <8 x i32> %5, %6 + %8 = or <8 x i32> %7, %3 + %9 = bitcast i8 %u to <8 x i1> + %10 = select <8 x i1> %9, <8 x i32> %8, <8 x i32> zeroinitializer + %bc = bitcast <8 x i32> %10 to <4 x i64> + ret <4 x i64> %bc 
+} + +define <8 x i64> @combine_var_maskz_ror_epi32_512(i16 zeroext %u, <8 x i64> %a, <8 x i64> %b) { +; AVX512-LABEL: combine_var_maskz_ror_epi32_512: +; AVX512: # %bb.0: +; AVX512-NEXT: kmovw %edi, %k1 +; AVX512-NEXT: vprorvd %zmm1, %zmm0, %zmm0 {%k1} {z} +; AVX512-NEXT: retq + %a0 = bitcast <8 x i64> %a to <16 x i32> + %b0 = bitcast <8 x i64> %b to <16 x i32> + %1 = and <16 x i32> %b0, + %2 = sub nsw <16 x i32> , %1 + %3 = lshr <16 x i32> %a0, %1 + %4 = icmp ult <16 x i32> %2, + %5 = select <16 x i1> %4, <16 x i32> %a0, <16 x i32> zeroinitializer + %6 = select <16 x i1> %4, <16 x i32> %2, <16 x i32> zeroinitializer + %7 = shl <16 x i32> %5, %6 + %8 = or <16 x i32> %7, %3 + %9 = bitcast i16 %u to <16 x i1> + %10 = select <16 x i1> %9, <16 x i32> %8, <16 x i32> zeroinitializer + %bc = bitcast <16 x i32> %10 to <8 x i64> + ret <8 x i64> %bc +} + +define <2 x i64> @combine_var_maskz_rol_epi64_128(i8 zeroext %u, <2 x i64> %a, <2 x i64> %b) { +; AVX512-LABEL: combine_var_maskz_rol_epi64_128: +; AVX512: # %bb.0: +; AVX512-NEXT: vprolvq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq + %1 = and <2 x i64> %b, + %2 = sub nsw <2 x i64> , %1 + %3 = shl <2 x i64> %a, %1 + %4 = icmp ult <2 x i64> %2, + %5 = select <2 x i1> %4, <2 x i64> %a, <2 x i64> zeroinitializer + %6 = select <2 x i1> %4, <2 x i64> %2, <2 x i64> zeroinitializer + %7 = lshr <2 x i64> %5, %6 + %8 = or <2 x i64> %7, %3 + %9 = bitcast i8 %u to <8 x i1> + %10 = shufflevector <8 x i1> %9, <8 x i1> undef, <2 x i32> + %11 = select <2 x i1> %10, <2 x i64> %8, <2 x i64> zeroinitializer + ret <2 x i64> %8 +} + +define <4 x i64> @combine_var_maskz_rol_epi64_256(i8 zeroext %u, <4 x i64> %a, <4 x i64> %b) { +; AVX512-LABEL: combine_var_maskz_rol_epi64_256: +; AVX512: # %bb.0: +; AVX512-NEXT: vprolvq %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: retq + %1 = and <4 x i64> %b, + %2 = sub nsw <4 x i64> , %1 + %3 = shl <4 x i64> %a, %1 + %4 = icmp ult <4 x i64> %2, + %5 = select <4 x i1> %4, <4 x i64> %a, <4 x i64> zeroinitializer + %6 = select <4 x i1> %4, <4 x i64> %2, <4 x i64> zeroinitializer + %7 = lshr <4 x i64> %5, %6 + %8 = or <4 x i64> %7, %3 + %9 = bitcast i8 %u to <8 x i1> + %10 = shufflevector <8 x i1> %9, <8 x i1> undef, <4 x i32> + %11 = select <4 x i1> %10, <4 x i64> %8, <4 x i64> zeroinitializer + ret <4 x i64> %8 +} + +define <8 x i64> @combine_var_maskz_rol_epi64_512(i8 zeroext %u, <8 x i64> %a, <8 x i64> %b) { +; AVX512-LABEL: combine_var_maskz_rol_epi64_512: +; AVX512: # %bb.0: +; AVX512-NEXT: vprolvq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %1 = and <8 x i64> %b, + %2 = sub nsw <8 x i64> , %1 + %3 = shl <8 x i64> %a, %1 + %4 = icmp ult <8 x i64> %2, + %5 = select <8 x i1> %4, <8 x i64> %a, <8 x i64> zeroinitializer + %6 = select <8 x i1> %4, <8 x i64> %2, <8 x i64> zeroinitializer + %7 = lshr <8 x i64> %5, %6 + %8 = or <8 x i64> %7, %3 + %9 = bitcast i8 %u to <8 x i1> + %10 = select <8 x i1> %9, <8 x i64> %8, <8 x i64> zeroinitializer + ret <8 x i64> %8 +} + +define <2 x i64> @combine_var_maskz_ror_epi64_128(i8 zeroext %u, <2 x i64> %a, <2 x i64> %b) { +; AVX512-LABEL: combine_var_maskz_ror_epi64_128: +; AVX512: # %bb.0: +; AVX512-NEXT: vprorvq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq + %1 = and <2 x i64> %b, + %2 = sub nsw <2 x i64> , %1 + %3 = lshr <2 x i64> %a, %1 + %4 = icmp ult <2 x i64> %2, + %5 = select <2 x i1> %4, <2 x i64> %a, <2 x i64> zeroinitializer + %6 = select <2 x i1> %4, <2 x i64> %2, <2 x i64> zeroinitializer + %7 = shl <2 x i64> %5, %6 + %8 = or <2 x i64> %7, %3 + %9 = bitcast i8 %u to <8 x i1> + %10 = shufflevector <8 x 
i1> %9, <8 x i1> undef, <2 x i32> + %11 = select <2 x i1> %10, <2 x i64> %8, <2 x i64> zeroinitializer + ret <2 x i64> %8 +} + +define <4 x i64> @combine_var_maskz_ror_epi64_256(i8 zeroext %u, <4 x i64> %a, <4 x i64> %b) { +; AVX512-LABEL: combine_var_maskz_ror_epi64_256: +; AVX512: # %bb.0: +; AVX512-NEXT: vprorvq %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: retq + %1 = and <4 x i64> %b, + %2 = sub nsw <4 x i64> , %1 + %3 = lshr <4 x i64> %a, %1 + %4 = icmp ult <4 x i64> %2, + %5 = select <4 x i1> %4, <4 x i64> %a, <4 x i64> zeroinitializer + %6 = select <4 x i1> %4, <4 x i64> %2, <4 x i64> zeroinitializer + %7 = shl <4 x i64> %5, %6 + %8 = or <4 x i64> %7, %3 + %9 = bitcast i8 %u to <8 x i1> + %10 = shufflevector <8 x i1> %9, <8 x i1> undef, <4 x i32> + %11 = select <4 x i1> %10, <4 x i64> %8, <4 x i64> zeroinitializer + ret <4 x i64> %8 +} + +define <8 x i64> @combine_var_maskz_ror_epi64_512(i8 zeroext %u, <8 x i64> %a, <8 x i64> %b) { +; AVX512-LABEL: combine_var_maskz_ror_epi64_512: +; AVX512: # %bb.0: +; AVX512-NEXT: vprorvq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %1 = and <8 x i64> %b, + %2 = sub nsw <8 x i64> , %1 + %3 = lshr <8 x i64> %a, %1 + %4 = icmp ult <8 x i64> %2, + %5 = select <8 x i1> %4, <8 x i64> %a, <8 x i64> zeroinitializer + %6 = select <8 x i1> %4, <8 x i64> %2, <8 x i64> zeroinitializer + %7 = shl <8 x i64> %5, %6 + %8 = or <8 x i64> %7, %3 + %9 = bitcast i8 %u to <8 x i1> + %10 = select <8 x i1> %9, <8 x i64> %8, <8 x i64> zeroinitializer + ret <8 x i64> %8 +} Index: test/Transforms/InstCombine/X86/x86-rotates.ll =================================================================== --- /dev/null +++ test/Transforms/InstCombine/X86/x86-rotates.ll @@ -0,0 +1,412 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -instcombine -S | FileCheck %s + +define <4 x i32> @avx512_mask_prol_d_128(<4 x i32> %v, <4 x i32> %src, i8 %mask) { +; CHECK-LABEL: @avx512_mask_prol_d_128( +; CHECK-NEXT: [[TMP1:%.*]] = shl <4 x i32> [[V:%.*]], +; CHECK-NEXT: [[TMP2:%.*]] = lshr <4 x i32> [[V]], +; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> undef, <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP3]], <4 x i32> [[SRC:%.*]] +; CHECK-NEXT: ret <4 x i32> [[TMP5]] +; + %1 = tail call <4 x i32> @llvm.x86.avx512.mask.prol.d.128(<4 x i32> %v, i32 5, <4 x i32> %src, i8 %mask) + ret <4 x i32> %1 +} + +define <4 x i32> @avx512_mask_pror_d_128(<4 x i32> %v, <4 x i32> %src, i8 %mask) { +; CHECK-LABEL: @avx512_mask_pror_d_128( +; CHECK-NEXT: [[TMP1:%.*]] = shl <4 x i32> [[V:%.*]], +; CHECK-NEXT: [[TMP2:%.*]] = lshr <4 x i32> [[V]], +; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> undef, <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP3]], <4 x i32> [[SRC:%.*]] +; CHECK-NEXT: ret <4 x i32> [[TMP5]] +; + %1 = tail call <4 x i32> @llvm.x86.avx512.mask.pror.d.128(<4 x i32> %v, i32 5, <4 x i32> %src, i8 %mask) + ret <4 x i32> %1 +} + +define <8 x i32> @avx512_mask_prol_d_256(<8 x i32> %v, <8 x i32> %src, i8 %mask) { +; CHECK-LABEL: @avx512_mask_prol_d_256( +; CHECK-NEXT: [[TMP1:%.*]] = shl <8 x i32> [[V:%.*]], +; CHECK-NEXT: [[TMP2:%.*]] = lshr <8 x i32> [[V]], +; CHECK-NEXT: [[TMP3:%.*]] = or <8 x 
i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = select <8 x i1> [[TMP4]], <8 x i32> [[TMP3]], <8 x i32> [[SRC:%.*]] +; CHECK-NEXT: ret <8 x i32> [[TMP5]] +; + %1 = tail call <8 x i32> @llvm.x86.avx512.mask.prol.d.256(<8 x i32> %v, i32 5, <8 x i32> %src, i8 %mask) + ret <8 x i32> %1 +} + +define <8 x i32> @avx512_mask_pror_d_256(<8 x i32> %v, <8 x i32> %src, i8 %mask) { +; CHECK-LABEL: @avx512_mask_pror_d_256( +; CHECK-NEXT: [[TMP1:%.*]] = shl <8 x i32> [[V:%.*]], +; CHECK-NEXT: [[TMP2:%.*]] = lshr <8 x i32> [[V]], +; CHECK-NEXT: [[TMP3:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = select <8 x i1> [[TMP4]], <8 x i32> [[TMP3]], <8 x i32> [[SRC:%.*]] +; CHECK-NEXT: ret <8 x i32> [[TMP5]] +; + %1 = tail call <8 x i32> @llvm.x86.avx512.mask.pror.d.256(<8 x i32> %v, i32 5, <8 x i32> %src, i8 %mask) + ret <8 x i32> %1 +} + +define <16 x i32> @avx512_mask_prol_d_512(<16 x i32> %v, <16 x i32> %src, i16 %mask) { +; CHECK-LABEL: @avx512_mask_prol_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = shl <16 x i32> [[V:%.*]], +; CHECK-NEXT: [[TMP2:%.*]] = lshr <16 x i32> [[V]], +; CHECK-NEXT: [[TMP3:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = select <16 x i1> [[TMP4]], <16 x i32> [[TMP3]], <16 x i32> [[SRC:%.*]] +; CHECK-NEXT: ret <16 x i32> [[TMP5]] +; + %1 = tail call <16 x i32> @llvm.x86.avx512.mask.prol.d.512(<16 x i32> %v, i32 5, <16 x i32> %src, i16 %mask) + ret <16 x i32> %1 +} + +define <16 x i32> @avx512_mask_pror_d_512(<16 x i32> %v, <16 x i32> %src, i16 %mask) { +; CHECK-LABEL: @avx512_mask_pror_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = shl <16 x i32> [[V:%.*]], +; CHECK-NEXT: [[TMP2:%.*]] = lshr <16 x i32> [[V]], +; CHECK-NEXT: [[TMP3:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = select <16 x i1> [[TMP4]], <16 x i32> [[TMP3]], <16 x i32> [[SRC:%.*]] +; CHECK-NEXT: ret <16 x i32> [[TMP5]] +; + %1 = tail call <16 x i32> @llvm.x86.avx512.mask.pror.d.512(<16 x i32> %v, i32 5, <16 x i32> %src, i16 %mask) + ret <16 x i32> %1 +} + +define <2 x i64> @avx512_mask_prol_q_128(<2 x i64> %v, <2 x i64> %src, i8 %mask) { +; CHECK-LABEL: @avx512_mask_prol_q_128( +; CHECK-NEXT: [[TMP1:%.*]] = shl <2 x i64> [[V:%.*]], +; CHECK-NEXT: [[TMP2:%.*]] = lshr <2 x i64> [[V]], +; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> undef, <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> [[TMP3]], <2 x i64> [[SRC:%.*]] +; CHECK-NEXT: ret <2 x i64> [[TMP5]] +; + %1 = tail call <2 x i64> @llvm.x86.avx512.mask.prol.q.128(<2 x i64> %v, i32 5, <2 x i64> %src, i8 %mask) + ret <2 x i64> %1 +} + +define <2 x i64> @avx512_mask_pror_q_128(<2 x i64> %v, <2 x i64> %src, i8 %mask) { +; CHECK-LABEL: @avx512_mask_pror_q_128( +; CHECK-NEXT: [[TMP1:%.*]] = shl <2 x i64> [[V:%.*]], +; CHECK-NEXT: [[TMP2:%.*]] = lshr <2 x i64> [[V]], +; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> undef, <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> [[TMP3]], <2 x i64> 
[[SRC:%.*]] +; CHECK-NEXT: ret <2 x i64> [[TMP5]] +; + %1 = tail call <2 x i64> @llvm.x86.avx512.mask.pror.q.128(<2 x i64> %v, i32 5, <2 x i64> %src, i8 %mask) + ret <2 x i64> %1 +} + +define <4 x i64> @avx512_mask_prol_q_256(<4 x i64> %v, <4 x i64> %src, i8 %mask) { +; CHECK-LABEL: @avx512_mask_prol_q_256( +; CHECK-NEXT: [[TMP1:%.*]] = shl <4 x i64> [[V:%.*]], +; CHECK-NEXT: [[TMP2:%.*]] = lshr <4 x i64> [[V]], +; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> undef, <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[TMP3]], <4 x i64> [[SRC:%.*]] +; CHECK-NEXT: ret <4 x i64> [[TMP5]] +; + %1 = tail call <4 x i64> @llvm.x86.avx512.mask.prol.q.256(<4 x i64> %v, i32 5, <4 x i64> %src, i8 %mask) + ret <4 x i64> %1 +} + +define <4 x i64> @avx512_mask_pror_q_256(<4 x i64> %v, <4 x i64> %src, i8 %mask) { +; CHECK-LABEL: @avx512_mask_pror_q_256( +; CHECK-NEXT: [[TMP1:%.*]] = shl <4 x i64> [[V:%.*]], +; CHECK-NEXT: [[TMP2:%.*]] = lshr <4 x i64> [[V]], +; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> undef, <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[TMP3]], <4 x i64> [[SRC:%.*]] +; CHECK-NEXT: ret <4 x i64> [[TMP5]] +; + %1 = tail call <4 x i64> @llvm.x86.avx512.mask.pror.q.256(<4 x i64> %v, i32 5, <4 x i64> %src, i8 %mask) + ret <4 x i64> %1 +} + +define <8 x i64> @avx512_mask_prol_q_512(<8 x i64> %v, <8 x i64> %src, i8 %mask) { +; CHECK-LABEL: @avx512_mask_prol_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = shl <8 x i64> [[V:%.*]], +; CHECK-NEXT: [[TMP2:%.*]] = lshr <8 x i64> [[V]], +; CHECK-NEXT: [[TMP3:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = select <8 x i1> [[TMP4]], <8 x i64> [[TMP3]], <8 x i64> [[SRC:%.*]] +; CHECK-NEXT: ret <8 x i64> [[TMP5]] +; + %1 = tail call <8 x i64> @llvm.x86.avx512.mask.prol.q.512(<8 x i64> %v, i32 5, <8 x i64> %src, i8 %mask) + ret <8 x i64> %1 +} + +define <8 x i64> @avx512_mask_pror_q_512(<8 x i64> %v, <8 x i64> %src, i8 %mask) { +; CHECK-LABEL: @avx512_mask_pror_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = shl <8 x i64> [[V:%.*]], +; CHECK-NEXT: [[TMP2:%.*]] = lshr <8 x i64> [[V]], +; CHECK-NEXT: [[TMP3:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = select <8 x i1> [[TMP4]], <8 x i64> [[TMP3]], <8 x i64> [[SRC:%.*]] +; CHECK-NEXT: ret <8 x i64> [[TMP5]] +; + %1 = tail call <8 x i64> @llvm.x86.avx512.mask.pror.q.512(<8 x i64> %v, i32 5, <8 x i64> %src, i8 %mask) + ret <8 x i64> %1 +} + +define <4 x i32> @avx512_mask_prolv_d_128(<4 x i32> %v, <4 x i32> %count, <4 x i32> %src, i8 %mask) { +; CHECK-LABEL: @avx512_mask_prolv_d_128( +; CHECK-NEXT: [[TMP1:%.*]] = and <4 x i32> [[COUNT:%.*]], +; CHECK-NEXT: [[TMP2:%.*]] = sub nsw <4 x i32> , [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = shl <4 x i32> [[V:%.*]], [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = icmp ult <4 x i32> [[TMP2]], +; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[TMP4]], <4 x i32> [[V]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[TMP4]], <4 x i32> [[TMP2]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = lshr <4 x i32> [[TMP5]], [[TMP6]] +; 
CHECK-NEXT: [[TMP8:%.*]] = or <4 x i32> [[TMP3]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP9]], <8 x i1> undef, <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP8]], <4 x i32> [[SRC:%.*]] +; CHECK-NEXT: ret <4 x i32> [[TMP10]] +; + %1 = tail call <4 x i32> @llvm.x86.avx512.mask.prolv.d.128(<4 x i32> %v, <4 x i32> %count, <4 x i32> %src, i8 %mask) + ret <4 x i32> %1 +} + +define <4 x i32> @avx512_mask_prorv_d_128(<4 x i32> %v, <4 x i32> %count, <4 x i32> %src, i8 %mask) { +; CHECK-LABEL: @avx512_mask_prorv_d_128( +; CHECK-NEXT: [[TMP1:%.*]] = and <4 x i32> [[COUNT:%.*]], +; CHECK-NEXT: [[TMP2:%.*]] = sub nsw <4 x i32> , [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult <4 x i32> [[TMP2]], +; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP3]], <4 x i32> [[V:%.*]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[TMP3]], <4 x i32> [[TMP2]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = shl <4 x i32> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = lshr <4 x i32> [[V]], [[TMP1]] +; CHECK-NEXT: [[TMP8:%.*]] = or <4 x i32> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP9]], <8 x i1> undef, <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP8]], <4 x i32> [[SRC:%.*]] +; CHECK-NEXT: ret <4 x i32> [[TMP10]] +; + %1 = tail call <4 x i32> @llvm.x86.avx512.mask.prorv.d.128(<4 x i32> %v, <4 x i32> %count, <4 x i32> %src, i8 %mask) + ret <4 x i32> %1 +} + +define <8 x i32> @avx512_mask_prolv_d_256(<8 x i32> %v, <8 x i32> %count, <8 x i32> %src, i8 %mask) { +; CHECK-LABEL: @avx512_mask_prolv_d_256( +; CHECK-NEXT: [[TMP1:%.*]] = and <8 x i32> [[COUNT:%.*]], +; CHECK-NEXT: [[TMP2:%.*]] = sub nsw <8 x i32> , [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = shl <8 x i32> [[V:%.*]], [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = icmp ult <8 x i32> [[TMP2]], +; CHECK-NEXT: [[TMP5:%.*]] = select <8 x i1> [[TMP4]], <8 x i32> [[V]], <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[TMP4]], <8 x i32> [[TMP2]], <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = lshr <8 x i32> [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i32> [[TMP3]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[TMP9]], <8 x i32> [[TMP8]], <8 x i32> [[SRC:%.*]] +; CHECK-NEXT: ret <8 x i32> [[TMP10]] +; + %1 = tail call <8 x i32> @llvm.x86.avx512.mask.prolv.d.256(<8 x i32> %v, <8 x i32> %count, <8 x i32> %src, i8 %mask) + ret <8 x i32> %1 +} + +define <8 x i32> @avx512_mask_prorv_d_256(<8 x i32> %v, <8 x i32> %count, <8 x i32> %src, i8 %mask) { +; CHECK-LABEL: @avx512_mask_prorv_d_256( +; CHECK-NEXT: [[TMP1:%.*]] = and <8 x i32> [[COUNT:%.*]], +; CHECK-NEXT: [[TMP2:%.*]] = sub nsw <8 x i32> , [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult <8 x i32> [[TMP2]], +; CHECK-NEXT: [[TMP4:%.*]] = select <8 x i1> [[TMP3]], <8 x i32> [[V:%.*]], <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = select <8 x i1> [[TMP3]], <8 x i32> [[TMP2]], <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = shl <8 x i32> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = lshr <8 x i32> [[V]], [[TMP1]] +; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i32> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> 
[[TMP9]], <8 x i32> [[TMP8]], <8 x i32> [[SRC:%.*]] +; CHECK-NEXT: ret <8 x i32> [[TMP10]] +; + %1 = tail call <8 x i32> @llvm.x86.avx512.mask.prorv.d.256(<8 x i32> %v, <8 x i32> %count, <8 x i32> %src, i8 %mask) + ret <8 x i32> %1 +} + +define <16 x i32> @avx512_mask_prolv_d_512(<16 x i32> %v, <16 x i32> %count, <16 x i32> %src, i16 %mask) { +; CHECK-LABEL: @avx512_mask_prolv_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = and <16 x i32> [[COUNT:%.*]], +; CHECK-NEXT: [[TMP2:%.*]] = sub nsw <16 x i32> , [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = shl <16 x i32> [[V:%.*]], [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = icmp ult <16 x i32> [[TMP2]], +; CHECK-NEXT: [[TMP5:%.*]] = select <16 x i1> [[TMP4]], <16 x i32> [[V]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[TMP4]], <16 x i32> [[TMP2]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = lshr <16 x i32> [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP3]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP8]], <16 x i32> [[SRC:%.*]] +; CHECK-NEXT: ret <16 x i32> [[TMP10]] +; + %1 = tail call <16 x i32> @llvm.x86.avx512.mask.prolv.d.512(<16 x i32> %v, <16 x i32> %count, <16 x i32> %src, i16 %mask) + ret <16 x i32> %1 +} + +define <16 x i32> @avx512_mask_prorv_d_512(<16 x i32> %v, <16 x i32> %count, <16 x i32> %src, i16 %mask) { +; CHECK-LABEL: @avx512_mask_prorv_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = and <16 x i32> [[COUNT:%.*]], +; CHECK-NEXT: [[TMP2:%.*]] = sub nsw <16 x i32> , [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult <16 x i32> [[TMP2]], +; CHECK-NEXT: [[TMP4:%.*]] = select <16 x i1> [[TMP3]], <16 x i32> [[V:%.*]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = select <16 x i1> [[TMP3]], <16 x i32> [[TMP2]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = shl <16 x i32> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = lshr <16 x i32> [[V]], [[TMP1]] +; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP8]], <16 x i32> [[SRC:%.*]] +; CHECK-NEXT: ret <16 x i32> [[TMP10]] +; + %1 = tail call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %v, <16 x i32> %count, <16 x i32> %src, i16 %mask) + ret <16 x i32> %1 +} + +define <2 x i64> @avx512_mask_prolv_q_128(<2 x i64> %v, <2 x i64> %count, <2 x i64> %src, i8 %mask) { +; CHECK-LABEL: @avx512_mask_prolv_q_128( +; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i64> [[COUNT:%.*]], +; CHECK-NEXT: [[TMP2:%.*]] = sub nsw <2 x i64> , [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = shl <2 x i64> [[V:%.*]], [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = icmp ult <2 x i64> [[TMP2]], +; CHECK-NEXT: [[TMP5:%.*]] = select <2 x i1> [[TMP4]], <2 x i64> [[V]], <2 x i64> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = select <2 x i1> [[TMP4]], <2 x i64> [[TMP2]], <2 x i64> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = lshr <2 x i64> [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = or <2 x i64> [[TMP3]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP9]], <8 x i1> undef, <2 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> [[TMP8]], <2 x i64> [[SRC:%.*]] +; CHECK-NEXT: ret <2 x i64> [[TMP10]] +; + %1 = tail call <2 x i64> @llvm.x86.avx512.mask.prolv.q.128(<2 x i64> %v, <2 x i64> %count, <2 x i64> %src, i8 %mask) 
+ ret <2 x i64> %1 +} + +define <2 x i64> @avx512_mask_prorv_q_128(<2 x i64> %v, <2 x i64> %count, <2 x i64> %src, i8 %mask) { +; CHECK-LABEL: @avx512_mask_prorv_q_128( +; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i64> [[COUNT:%.*]], +; CHECK-NEXT: [[TMP2:%.*]] = sub nsw <2 x i64> , [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult <2 x i64> [[TMP2]], +; CHECK-NEXT: [[TMP4:%.*]] = select <2 x i1> [[TMP3]], <2 x i64> [[V:%.*]], <2 x i64> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = select <2 x i1> [[TMP3]], <2 x i64> [[TMP2]], <2 x i64> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = shl <2 x i64> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = lshr <2 x i64> [[V]], [[TMP1]] +; CHECK-NEXT: [[TMP8:%.*]] = or <2 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP9]], <8 x i1> undef, <2 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> [[TMP8]], <2 x i64> [[SRC:%.*]] +; CHECK-NEXT: ret <2 x i64> [[TMP10]] +; + %1 = tail call <2 x i64> @llvm.x86.avx512.mask.prorv.q.128(<2 x i64> %v, <2 x i64> %count, <2 x i64> %src, i8 %mask) + ret <2 x i64> %1 +} + +define <4 x i64> @avx512_mask_prolv_q_256(<4 x i64> %v, <4 x i64> %count, <4 x i64> %src, i8 %mask) { +; CHECK-LABEL: @avx512_mask_prolv_q_256( +; CHECK-NEXT: [[TMP1:%.*]] = and <4 x i64> [[COUNT:%.*]], +; CHECK-NEXT: [[TMP2:%.*]] = sub nsw <4 x i64> , [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = shl <4 x i64> [[V:%.*]], [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = icmp ult <4 x i64> [[TMP2]], +; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[TMP4]], <4 x i64> [[V]], <4 x i64> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[TMP4]], <4 x i64> [[TMP2]], <4 x i64> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = lshr <4 x i64> [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = or <4 x i64> [[TMP3]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP9]], <8 x i1> undef, <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[TMP8]], <4 x i64> [[SRC:%.*]] +; CHECK-NEXT: ret <4 x i64> [[TMP10]] +; + %1 = tail call <4 x i64> @llvm.x86.avx512.mask.prolv.q.256(<4 x i64> %v, <4 x i64> %count, <4 x i64> %src, i8 %mask) + ret <4 x i64> %1 +} + +define <4 x i64> @avx512_mask_prorv_q_256(<4 x i64> %v, <4 x i64> %count, <4 x i64> %src, i8 %mask) { +; CHECK-LABEL: @avx512_mask_prorv_q_256( +; CHECK-NEXT: [[TMP1:%.*]] = and <4 x i64> [[COUNT:%.*]], +; CHECK-NEXT: [[TMP2:%.*]] = sub nsw <4 x i64> , [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult <4 x i64> [[TMP2]], +; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP3]], <4 x i64> [[V:%.*]], <4 x i64> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[TMP3]], <4 x i64> [[TMP2]], <4 x i64> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = shl <4 x i64> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = lshr <4 x i64> [[V]], [[TMP1]] +; CHECK-NEXT: [[TMP8:%.*]] = or <4 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP9]], <8 x i1> undef, <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[TMP8]], <4 x i64> [[SRC:%.*]] +; CHECK-NEXT: ret <4 x i64> [[TMP10]] +; + %1 = tail call <4 x i64> @llvm.x86.avx512.mask.prorv.q.256(<4 x i64> %v, <4 x i64> %count, <4 x i64> %src, i8 %mask) + ret <4 x i64> %1 +} + +define <8 x i64> @avx512_mask_prolv_q_512(<8 x i64> %v, 
<8 x i64> %count, <8 x i64> %src, i8 %mask) { +; CHECK-LABEL: @avx512_mask_prolv_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = and <8 x i64> [[COUNT:%.*]], +; CHECK-NEXT: [[TMP2:%.*]] = sub nsw <8 x i64> , [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = shl <8 x i64> [[V:%.*]], [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = icmp ult <8 x i64> [[TMP2]], +; CHECK-NEXT: [[TMP5:%.*]] = select <8 x i1> [[TMP4]], <8 x i64> [[V]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[TMP4]], <8 x i64> [[TMP2]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = lshr <8 x i64> [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP3]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP8]], <8 x i64> [[SRC:%.*]] +; CHECK-NEXT: ret <8 x i64> [[TMP10]] +; + %1 = tail call <8 x i64> @llvm.x86.avx512.mask.prolv.q.512(<8 x i64> %v, <8 x i64> %count, <8 x i64> %src, i8 %mask) + ret <8 x i64> %1 +} + +define <8 x i64> @avx512_mask_prorv_q_512(<8 x i64> %v, <8 x i64> %count, <8 x i64> %src, i8 %mask) { +; CHECK-LABEL: @avx512_mask_prorv_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = and <8 x i64> [[COUNT:%.*]], +; CHECK-NEXT: [[TMP2:%.*]] = sub nsw <8 x i64> , [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult <8 x i64> [[TMP2]], +; CHECK-NEXT: [[TMP4:%.*]] = select <8 x i1> [[TMP3]], <8 x i64> [[V:%.*]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = select <8 x i1> [[TMP3]], <8 x i64> [[TMP2]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = shl <8 x i64> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = lshr <8 x i64> [[V]], [[TMP1]] +; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP8]], <8 x i64> [[SRC:%.*]] +; CHECK-NEXT: ret <8 x i64> [[TMP10]] +; + %1 = tail call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %v, <8 x i64> %count, <8 x i64> %src, i8 %mask) + ret <8 x i64> %1 +} + + +declare <4 x i32> @llvm.x86.avx512.mask.prol.d.128(<4 x i32>, i32, <4 x i32>, i8) +declare <4 x i32> @llvm.x86.avx512.mask.pror.d.128(<4 x i32>, i32, <4 x i32>, i8) +declare <8 x i32> @llvm.x86.avx512.mask.prol.d.256(<8 x i32>, i32, <8 x i32>, i8) +declare <8 x i32> @llvm.x86.avx512.mask.pror.d.256(<8 x i32>, i32, <8 x i32>, i8) +declare <16 x i32> @llvm.x86.avx512.mask.prol.d.512(<16 x i32>, i32, <16 x i32>, i16) +declare <16 x i32> @llvm.x86.avx512.mask.pror.d.512(<16 x i32>, i32, <16 x i32>, i16) +declare <2 x i64> @llvm.x86.avx512.mask.prol.q.128(<2 x i64>, i32, <2 x i64>, i8) +declare <2 x i64> @llvm.x86.avx512.mask.pror.q.128(<2 x i64>, i32, <2 x i64>, i8) +declare <4 x i64> @llvm.x86.avx512.mask.prol.q.256(<4 x i64>, i32, <4 x i64>, i8) +declare <4 x i64> @llvm.x86.avx512.mask.pror.q.256(<4 x i64>, i32, <4 x i64>, i8) +declare <8 x i64> @llvm.x86.avx512.mask.prol.q.512(<8 x i64>, i32, <8 x i64>, i8) +declare <8 x i64> @llvm.x86.avx512.mask.pror.q.512(<8 x i64>, i32, <8 x i64>, i8) +declare <4 x i32> @llvm.x86.avx512.mask.prolv.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8) +declare <4 x i32> @llvm.x86.avx512.mask.prorv.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8) +declare <8 x i32> @llvm.x86.avx512.mask.prolv.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8) +declare <8 x i32> @llvm.x86.avx512.mask.prorv.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8) +declare <16 x i32> @llvm.x86.avx512.mask.prolv.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) +declare <16 x i32> 
@llvm.x86.avx512.mask.prorv.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) +declare <2 x i64> @llvm.x86.avx512.mask.prolv.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8) +declare <2 x i64> @llvm.x86.avx512.mask.prorv.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8) +declare <4 x i64> @llvm.x86.avx512.mask.prolv.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8) +declare <4 x i64> @llvm.x86.avx512.mask.prorv.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8) +declare <8 x i64> @llvm.x86.avx512.mask.prolv.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) +declare <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
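
The ISD::ROTL/ROTR combine added to X86ISelLowering.cpp at the top of this patch relies on the identity rotl(X, Y & (EltSizeInBits - 1)) == rotl(X, Y): rotating by any multiple of the element width is a no-op, so pre-masking the rotate amount cannot change the result. A minimal C++ check of that identity for a 32-bit element, illustrative only and not part of the patch:

#include <cassert>
#include <cstdint>

// Rotate a 32-bit value left one bit at a time, so no modulo is applied to
// the rotate amount inside this helper.
static uint32_t Rotl32(uint32_t X, uint32_t Y) {
  for (uint32_t I = 0; I < Y; ++I)
    X = (X << 1) | (X >> 31);
  return X;
}

int main() {
  // rotl(X, Y & 31) == rotl(X, Y): the AND that clamps the rotate amount is
  // redundant, which is exactly the fold combineRotate performs before the
  // rotate is selected to, e.g., VPROLVD/VPRORVD.
  for (uint32_t Y = 0; Y < 256; ++Y) {
    uint32_t X = 0x9E3779B9u + Y * 0x85EBCA6Bu;  // arbitrary test values
    assert(Rotl32(X, Y & 31u) == Rotl32(X, Y));
  }
  return 0;
}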