Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -23137,12 +23137,36 @@
   return SDValue();
 }
 
+// Patterns emitted from IR might saturate the count operand, because shifting
+// an element by its bit width or more creates a poison value in IR. The x86
+// shift nodes handle out-of-range counts themselves, so remove the saturation here.
+static SDValue
+RemoveSatFromScalarVarShift(SDValue Op, SelectionDAG &DAG,
+                            bool IsArith) {
+  MVT VT = Op.getSimpleValueType();
+  unsigned EltSize = VT.getSizeInBits();
+
+  if (Op.getOpcode() == ISD::TRUNCATE ||
+      Op.getOpcode() == ISD::ZERO_EXTEND)
+    Op = Op.getOperand(0);
+
+  if (Op.getOpcode() == ISD::UMIN) {
+    auto *ConstVal = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+    if (ConstVal && ((!IsArith && ConstVal->getZExtValue() == EltSize) ||
+                     (IsArith && ConstVal->getZExtValue() == (EltSize - 1)))) {
+      return Op.getOperand(0);
+    }
+  }
+  return SDValue();
+}
+
 static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
                                         const X86Subtarget &Subtarget) {
   MVT VT = Op.getSimpleValueType();
   SDLoc dl(Op);
   SDValue R = Op.getOperand(0);
   SDValue Amt = Op.getOperand(1);
+  bool IsArith = Op.getOpcode() == ISD::SRA;
   unsigned X86OpcI = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
     (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
 
@@ -23160,6 +23184,9 @@
       BaseShAmt = BV->getSplatValue();
       if (BaseShAmt && BaseShAmt.isUndef())
         BaseShAmt = SDValue();
+      else if (BaseShAmt)
+        if (SDValue V = RemoveSatFromScalarVarShift(BaseShAmt, DAG, IsArith))
+          return getTargetVShiftNode(X86OpcI, dl, VT, R, V, Subtarget, DAG);
     } else {
       if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
         Amt = Amt.getOperand(0);
@@ -23199,7 +23226,7 @@
   }
 
   // Check cases (mainly 32-bit) where i64 is expanded into high and low parts.
-  if (VT == MVT::v2i64 && Amt.getOpcode() == ISD::BITCAST && 
+  if (VT == MVT::v2i64 && Amt.getOpcode() == ISD::BITCAST &&
       Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
     Amt = Amt.getOperand(0);
     unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
@@ -33257,6 +33284,167 @@
   return SDValue();
 }
 
+// Canonicalize the pattern created by lowering an x86 intrinsic.
+// The lowering function requires an ISD::UMIN node to match the pattern.
+// The SelectionDAG created from IR sometimes includes a
+// (trunc (select setcc, x, y)) fragment which gets optimized to
+// (select setcc, (trunc x), (trunc y)),
+// which prevents combining to a umin node.
+// This function forces the select combine to be done first.
+// In the case of a variable shift, remove the umin node (it was only needed in IR).
+static SDValue
+foldShiftArithmeticIntrinsicPattern(SDNode *N, SelectionDAG &DAG) {
+  unsigned Opcode = N->getOpcode();
+  if (Opcode != ISD::SRA)
+    return SDValue();
+
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  EVT VT = N0.getValueType();
+  if (!VT.isVector())
+    return SDValue();
+
+  EVT SVT = VT.getVectorElementType();
+  unsigned EltSize = VT.getScalarSizeInBits();
+  SDLoc DL(N);
+
+  if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(N1)) {
+    ArrayRef<int> Mask = SVN->getMask();
+    for (unsigned i = 0; i < Mask.size(); ++i)
+      if (Mask[i] != 0)
+        return SDValue();
+
+    SDValue Insert = N1.getOperand(0);
+    if (Insert.getOpcode() != ISD::INSERT_VECTOR_ELT ||
+        N1.getOperand(1).getOpcode() != ISD::UNDEF ||
+        Insert.getOperand(0).getOpcode() != ISD::UNDEF)
+      return SDValue();
+
+    auto *ConstZero = dyn_cast<ConstantSDNode>(Insert.getOperand(2));
+    if (!ConstZero || !ConstZero->isNullValue())
+      return SDValue();
+
+    SDValue Tmp = Insert.getOperand(1);
+    SDValue BaseShAmt = Tmp;
+    if (Tmp.getOpcode() == ISD::TRUNCATE ||
+        Tmp.getOpcode() == ISD::ZERO_EXTEND)
+      BaseShAmt = Tmp.getOperand(0);
+
+    if (BaseShAmt.getOpcode() != ISD::SELECT ||
+        BaseShAmt.getOperand(0).getOpcode() != ISD::SETCC ||
+        BaseShAmt.getOperand(1) != BaseShAmt.getOperand(0).getOperand(0) ||
+        BaseShAmt.getOperand(2) != BaseShAmt.getOperand(0).getOperand(1))
+      return SDValue();
+
+    ISD::CondCode CC =
+        cast<CondCodeSDNode>(BaseShAmt.getOperand(0).getOperand(2))->get();
+    auto *ConstVal = dyn_cast<ConstantSDNode>(BaseShAmt.getOperand(2));
+    if (!ConstVal || CC != ISD::SETULT ||
+        ConstVal->getZExtValue() != (uint64_t)(EltSize - 1))
+      return SDValue();
+
+    MVT BSATy = BaseShAmt.getSimpleValueType();
+    BaseShAmt = DAG.getNode(ISD::UMIN, DL, BSATy, BaseShAmt.getOperand(1),
+                            BaseShAmt.getOperand(2));
+
+    if (Tmp.getOpcode() != ISD::SELECT)
+      BaseShAmt = DAG.getNode(Tmp.getOpcode(), DL, SVT, BaseShAmt);
+
+    Insert = DAG.getNode(Insert.getOpcode(), DL, VT, Insert.getOperand(0),
+                         BaseShAmt, Insert.getOperand(2));
+
+    N1 = DAG.getVectorShuffle(VT, DL, Insert, N1.getOperand(1), Mask);
+    return DAG.getNode(ISD::SRA, DL, VT, N0, N1);
+  } else if (N1.getOpcode() == ISD::UMIN) {
+    APInt SplatValue;
+    if (ISD::isConstantSplatVector(N1.getOperand(1).getNode(), SplatValue) &&
+        SplatValue.getZExtValue() == (uint64_t)(EltSize - 1)) {
+      N1 = N1.getOperand(0);
+      return DAG.getNode(ISD::SRA, DL, VT, N0, N1);
+    }
+  }
+
+  return SDValue();
+}
+
+// Canonicalize the pattern created by lowering an x86 intrinsic.
+// The lowering function requires an ISD::UMIN node to match the pattern.
+// fold (shl (select (setcc y, max, ult), xvec, zerovec),
+//           (select (setcc y, max, ult), yvec, zerovec))
+// -> (shl xvec, (umin y, max)vec)
+// or, in case of a variable shift:
+// fold (shl (vselect (setcc yvec, maxvec, ult), xvec, zerovec),
+//           (vselect (setcc yvec, maxvec, ult), yvec, zerovec))
+// -> (shl xvec, yvec)
+static SDValue foldShiftLogicalIntrinsicPattern(SDNode *N, SelectionDAG &DAG) {
+  unsigned Opcode = N->getOpcode();
+  if (Opcode != ISD::SHL && Opcode != ISD::SRL)
+    return SDValue();
+
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  EVT VT = N0.getValueType();
+  if (!VT.isVector())
+    return SDValue();
+
+  EVT SVT = VT.getVectorElementType();
+  unsigned OpSizeInBits = VT.getScalarSizeInBits();
+  SDLoc DL(N);
+
+  auto CheckCommonPart = [](SDValue N0, SDValue N1) {
+    SDValue SETCC0 = N0.getOperand(0);
+    SDValue SETCC1 = N1.getOperand(0);
+    if (SETCC0.getOpcode() != ISD::SETCC || SETCC0 != SETCC1)
+      return false;
+
+    if (cast<CondCodeSDNode>(SETCC0.getOperand(2))->get() != ISD::SETULT)
+      return false;
+
+    if (!ISD::isBuildVectorAllZeros(N0.getOperand(2).getNode()) ||
+        !ISD::isBuildVectorAllZeros(N1.getOperand(2).getNode()))
+      return false;
+
+    return true;
+  };
+
+  if (N0.getOpcode() == ISD::SELECT && N1.getOpcode() == ISD::SELECT) {
+    if (!CheckCommonPart(N0, N1))
+      return SDValue();
+
+    SDValue SETCC0 = N0.getOperand(0);
+    auto *MaxNode = dyn_cast<ConstantSDNode>(SETCC0.getOperand(1).getNode());
+    if (!MaxNode || MaxNode->getZExtValue() != (uint64_t)OpSizeInBits)
+      return SDValue();
+
+    SDValue LHS = N0.getOperand(1);
+
+    SDValue Count = SETCC0.getOperand(0);
+    SDValue Max = SETCC0.getOperand(1);
+    EVT CountTy = Count.getValueType();
+    Count = DAG.getNode(ISD::UMIN, DL, CountTy, Count, Max);
+    Count = DAG.getZExtOrTrunc(Count, DL, SVT);
+    SDValue RHS = DAG.getSplatBuildVector(VT, DL, Count);
+
+    return DAG.getNode(Opcode, DL, VT, LHS, RHS);
+  } else if (N0.getOpcode() == ISD::VSELECT && N1.getOpcode() == ISD::VSELECT) {
+    if (!CheckCommonPart(N0, N1))
+      return SDValue();
+
+    SDValue Max = N0.getOperand(0).getOperand(1);
+    APInt SplatValue;
+    if (!ISD::isConstantSplatVector(Max.getNode(), SplatValue) ||
+        SplatValue != APInt(OpSizeInBits, OpSizeInBits))
+      return SDValue();
+
+    SDValue LHS = N0.getOperand(1);
+    SDValue RHS = N1.getOperand(1);
+
+    return DAG.getNode(Opcode, DL, VT, LHS, RHS);
+  }
+
+  return SDValue();
+}
+
 static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
@@ -33313,6 +33501,10 @@
     return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
   }
 
+  // Try to fold the pattern emitted by lowering an x86 intrinsic.
+  if (SDValue V = foldShiftLogicalIntrinsicPattern(N, DAG))
+    return V;
+
   return SDValue();
 }
 
@@ -33322,6 +33514,10 @@
   EVT VT = N0.getValueType();
   unsigned Size = VT.getSizeInBits();
 
+  // Try to fold the pattern emitted by lowering an x86 intrinsic.
+  if (SDValue V = foldShiftArithmeticIntrinsicPattern(N, DAG))
+    return V;
+
   // fold (ashr (shl, a, [56,48,32,24,16]), SarConst)
   // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or
   // into (lshr, (sext (a), SarConst - [56,48,32,24,16]))
@@ -33375,6 +33571,10 @@
   SDValue N1 = N->getOperand(1);
   EVT VT = N0.getValueType();
 
+  // Try to fold the pattern emitted by lowering an x86 intrinsic.
+  if (SDValue V = foldShiftLogicalIntrinsicPattern(N, DAG))
+    return V;
+
   // Only do this on the last DAG combine as it can interfere with other
   // combines.
if (!DCI.isAfterLegalizeDAG()) Index: lib/Transforms/InstCombine/InstCombineCalls.cpp =================================================================== --- lib/Transforms/InstCombine/InstCombineCalls.cpp +++ lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -247,79 +247,138 @@ InstCombiner::BuilderTy &Builder) { bool LogicalShift = false; bool ShiftLeft = false; + bool IsCountOperandInteger = false; switch (II.getIntrinsicID()) { default: llvm_unreachable("Unexpected intrinsic!"); case Intrinsic::x86_sse2_psra_d: case Intrinsic::x86_sse2_psra_w: - case Intrinsic::x86_sse2_psrai_d: - case Intrinsic::x86_sse2_psrai_w: case Intrinsic::x86_avx2_psra_d: case Intrinsic::x86_avx2_psra_w: - case Intrinsic::x86_avx2_psrai_d: - case Intrinsic::x86_avx2_psrai_w: case Intrinsic::x86_avx512_psra_q_128: - case Intrinsic::x86_avx512_psrai_q_128: case Intrinsic::x86_avx512_psra_q_256: - case Intrinsic::x86_avx512_psrai_q_256: case Intrinsic::x86_avx512_psra_d_512: case Intrinsic::x86_avx512_psra_q_512: case Intrinsic::x86_avx512_psra_w_512: + LogicalShift = false; ShiftLeft = false; IsCountOperandInteger = false; + break; + case Intrinsic::x86_sse2_psrai_d: + case Intrinsic::x86_sse2_psrai_w: + case Intrinsic::x86_avx2_psrai_d: + case Intrinsic::x86_avx2_psrai_w: + case Intrinsic::x86_avx512_psrai_q_128: + case Intrinsic::x86_avx512_psrai_q_256: case Intrinsic::x86_avx512_psrai_d_512: case Intrinsic::x86_avx512_psrai_q_512: case Intrinsic::x86_avx512_psrai_w_512: - LogicalShift = false; ShiftLeft = false; + LogicalShift = false; ShiftLeft = false; IsCountOperandInteger = true; break; case Intrinsic::x86_sse2_psrl_d: case Intrinsic::x86_sse2_psrl_q: case Intrinsic::x86_sse2_psrl_w: - case Intrinsic::x86_sse2_psrli_d: - case Intrinsic::x86_sse2_psrli_q: - case Intrinsic::x86_sse2_psrli_w: case Intrinsic::x86_avx2_psrl_d: case Intrinsic::x86_avx2_psrl_q: case Intrinsic::x86_avx2_psrl_w: - case Intrinsic::x86_avx2_psrli_d: - case Intrinsic::x86_avx2_psrli_q: - case Intrinsic::x86_avx2_psrli_w: case Intrinsic::x86_avx512_psrl_d_512: case Intrinsic::x86_avx512_psrl_q_512: case Intrinsic::x86_avx512_psrl_w_512: + LogicalShift = true; ShiftLeft = false; IsCountOperandInteger = false; + break; + case Intrinsic::x86_sse2_psrli_d: + case Intrinsic::x86_sse2_psrli_q: + case Intrinsic::x86_sse2_psrli_w: + case Intrinsic::x86_avx2_psrli_d: + case Intrinsic::x86_avx2_psrli_q: + case Intrinsic::x86_avx2_psrli_w: case Intrinsic::x86_avx512_psrli_d_512: case Intrinsic::x86_avx512_psrli_q_512: case Intrinsic::x86_avx512_psrli_w_512: - LogicalShift = true; ShiftLeft = false; + LogicalShift = true; ShiftLeft = false; IsCountOperandInteger = true; break; case Intrinsic::x86_sse2_psll_d: case Intrinsic::x86_sse2_psll_q: case Intrinsic::x86_sse2_psll_w: - case Intrinsic::x86_sse2_pslli_d: - case Intrinsic::x86_sse2_pslli_q: - case Intrinsic::x86_sse2_pslli_w: case Intrinsic::x86_avx2_psll_d: case Intrinsic::x86_avx2_psll_q: case Intrinsic::x86_avx2_psll_w: - case Intrinsic::x86_avx2_pslli_d: - case Intrinsic::x86_avx2_pslli_q: - case Intrinsic::x86_avx2_pslli_w: case Intrinsic::x86_avx512_psll_d_512: case Intrinsic::x86_avx512_psll_q_512: case Intrinsic::x86_avx512_psll_w_512: + LogicalShift = true; ShiftLeft = true; IsCountOperandInteger = false; + break; + case Intrinsic::x86_sse2_pslli_d: + case Intrinsic::x86_sse2_pslli_q: + case Intrinsic::x86_sse2_pslli_w: + case Intrinsic::x86_avx2_pslli_d: + case Intrinsic::x86_avx2_pslli_q: + case Intrinsic::x86_avx2_pslli_w: case Intrinsic::x86_avx512_pslli_d_512: case 
Intrinsic::x86_avx512_pslli_q_512: case Intrinsic::x86_avx512_pslli_w_512:
-    LogicalShift = true; ShiftLeft = true;
+    LogicalShift = true; ShiftLeft = true; IsCountOperandInteger = true;
     break;
   }
   assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");
 
   // Simplify if count is constant.
+  auto Vec = II.getArgOperand(0);
   auto Arg1 = II.getArgOperand(1);
+  auto VT = cast<VectorType>(Vec->getType());
+  auto SVT = VT->getElementType();
+  unsigned VWidth = VT->getNumElements();
+  unsigned BitWidth = SVT->getPrimitiveSizeInBits();
+
   auto CAZ = dyn_cast<ConstantAggregateZero>(Arg1);
   auto CDV = dyn_cast<ConstantDataVector>(Arg1);
   auto CInt = dyn_cast<ConstantInt>(Arg1);
-  if (!CAZ && !CDV && !CInt)
-    return nullptr;
+
+  if (!CAZ && !CDV && !CInt) {
+    // Get the count argument.
+    Value *Count;
+    if (!IsCountOperandInteger) {
+      // The number of bits to shift by is stored in the lowest 64 bits
+      // of the Arg1 vector.
+      assert(Arg1->getType()->isVectorTy() &&
+             "Count argument expected to be of a vector type.");
+      // Retrieve the shift value.
+      VectorType *VTy = VectorType::get(Type::getInt64Ty(II.getContext()), 2);
+      Arg1 = Builder.CreateBitCast(Arg1, VTy);
+      Count = Builder.CreateExtractElement(Arg1, (uint64_t)0);
+    } else {
+      // The number of bits to shift by is stored in an unsigned integer.
+      assert(Arg1->getType()->isIntegerTy() &&
+             "Count argument expected to be of an integer type.");
+      Count = Arg1;
+    }
+
+    // In IR, shift amounts greater than or equal to BitWidth produce a
+    // poison value, so clamp the count to BitWidth - 1 for arithmetic
+    // shifts and handle the out-of-range case explicitly for logical shifts.
+    Type *CountTy = Count->getType();
+    Value *MaxConstVal = ConstantInt::get(CountTy, BitWidth - 1);
+
+    if (!LogicalShift) {
+      // Clamp out-of-range shift amounts (an unsigned min).
+      // Shifting by (BitWidth - 1) in an arithmetic shift won't emit a poison value.
+      Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_ULT, Count, MaxConstVal);
+      Count = Builder.CreateSelect(Cmp, Count, MaxConstVal);
+      Count = Builder.CreateZExtOrTrunc(Count, SVT);
+      Value *ShiftVec = Builder.CreateVectorSplat(VWidth, Count);
+      return Builder.CreateAShr(Vec, ShiftVec);
+    } else {
+      // Shifting by BitWidth would emit a poison value in a logical shift.
+      // In cases where Count >= BitWidth, don't do a shift and
+      // insert a zero value instead.
+      Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_ULE, Count, MaxConstVal);
+      Count = Builder.CreateZExtOrTrunc(Count, SVT);
+      Value *ShiftVec = Builder.CreateVectorSplat(VWidth, Count);
+      Value *ZeroVec = ConstantAggregateZero::get(VT);
+      Vec = Builder.CreateSelect(Cmp, Vec, ZeroVec);
+      ShiftVec = Builder.CreateSelect(Cmp, ShiftVec, ZeroVec);
+      return ShiftLeft ? Builder.CreateShl(Vec, ShiftVec) :
+                         Builder.CreateLShr(Vec, ShiftVec);
+    }
+  }
 
   APInt Count(64, 0);
   if (CDV) {
@@ -341,12 +400,6 @@
   else if (CInt)
     Count = CInt->getValue();
 
-  auto Vec = II.getArgOperand(0);
-  auto VT = cast<VectorType>(Vec->getType());
-  auto SVT = VT->getElementType();
-  unsigned VWidth = VT->getNumElements();
-  unsigned BitWidth = SVT->getPrimitiveSizeInBits();
-
   // If shift-by-zero then just return the original value.
   if (Count.isNullValue())
     return Vec;
@@ -423,17 +476,46 @@
   }
   assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");
 
-  // Simplify if all shift amounts are constant/undef.
-  auto *CShift = dyn_cast<Constant>(II.getArgOperand(1));
-  if (!CShift)
-    return nullptr;
-
   auto Vec = II.getArgOperand(0);
+  auto CountVec = II.getArgOperand(1);
   auto VT = cast<VectorType>(II.getType());
   auto SVT = VT->getVectorElementType();
   int NumElts = VT->getNumElements();
   int BitWidth = SVT->getIntegerBitWidth();
+  auto *CShift = dyn_cast<Constant>(II.getArgOperand(1));
+
+  // Lower to generic IR shifts if the shift amounts are not constant/undef.
+  if (!CShift) {
+    auto VecType = dyn_cast<VectorType>(Vec->getType());
+    auto CountVecType = dyn_cast<VectorType>(CountVec->getType());
+    if (VecType != CountVecType)
+      return nullptr;
+    // Create a splat vector of the maximum in-range shift amount (BitWidth - 1).
+    auto ConstVal = ConstantInt::get(SVT, BitWidth - 1);
+    auto ConstantVecMax = Builder.CreateVectorSplat(NumElts, ConstVal);
+
+    if (!LogicalShift) {
+      // Clamp out-of-range shift amounts (an unsigned min).
+      Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_ULT,
+                                      CountVec, ConstantVecMax);
+      CountVec = Builder.CreateSelect(Cmp, CountVec, ConstantVecMax);
+      return Builder.CreateAShr(Vec, CountVec);
+    } else {
+      // Shifting an element by BitWidth or more would emit
+      // a poison value in LLVM IR.
+      // In that case, a zero value is inserted and shifted by 0 bits.
+      Value *ZeroVec = ConstantAggregateZero::get(VecType);
+      Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_ULE,
+                                      CountVec, ConstantVecMax);
+      Vec = Builder.CreateSelect(Cmp, Vec, ZeroVec);
+      CountVec = Builder.CreateSelect(Cmp, CountVec, ZeroVec);
+      return ShiftLeft ? Builder.CreateShl(Vec, CountVec) :
+                         Builder.CreateLShr(Vec, CountVec);
+    }
+  }
+
+  // Simplify if all shift amounts are constant/undef.
   // Collect each element's shift amount.
   // We also collect special cases: UNDEF = -1, OUT-OF-RANGE = BitWidth.
   bool AnyOutOfRange = false;
Index: test/CodeGen/X86/combine-shl.ll
===================================================================
--- test/CodeGen/X86/combine-shl.ll
+++ test/CodeGen/X86/combine-shl.ll
@@ -2,6 +2,7 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX-SLOW
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX-FAST
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512
 
 ; fold (shl 0, x) -> 0
 define <4 x i32> @combine_vec_shl_zero(<4 x i32> %x) {
@@ -597,3 +598,290 @@
   %2 = shl <4 x i32> %1, 
   ret <4 x i32> %2
 }
+
+; fold (shl (select (setcc y, c, ult), xvec, zerovec),
+;           (select (setcc y, c, ult), yvec, zerovec))
+; -> (shl xvec, (umin yvec, maxvec))
+define <2 x i64> @combine_vec_shl_min128(<2 x i64> %x, <2 x i64> %y) {
+; SSE-LABEL: combine_vec_shl_min128:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pslld $23, %xmm1
+; SSE-NEXT:    paddd {{.*}}(%rip), %xmm1
+; SSE-NEXT:    cvttps2dq %xmm1, %xmm1
+; SSE-NEXT:    pmulld %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_vec_shl_min128:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: combine_vec_shl_min128:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+  %1 = bitcast <2 x i64> %x to <4 x i32>
+  %2 = bitcast <2 x i64> %y to <4 x i32>
+  %3 = icmp ult <4 x i32> %2, 
+  %4 = select <4 x i1> %3, <4 x i32> %1, <4 x i32> zeroinitializer
+  %5 = select <4 x i1> %3, <4 x i32> %2, <4 x i32> zeroinitializer
+  %6 = shl <4 x i32> %4, %5
+  %7 = bitcast <4 x i32> %6 to <2 x i64>
+  ret <2 x i64> %7
+}
+
+define <4 x i64> @combine_vec_shl_min256(<4 x i64> %x, <4 x i64> %y) { +; SSE-LABEL: combine_vec_shl_min256: +; SSE: # %bb.0: +; SSE-NEXT: pslld $23, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216] +; SSE-NEXT: paddd %xmm4, %xmm2 +; SSE-NEXT: cvttps2dq %xmm2, %xmm2 +; SSE-NEXT: pmulld %xmm2, %xmm0 +; SSE-NEXT: pslld $23, %xmm3 +; SSE-NEXT: paddd %xmm4, %xmm3 +; SSE-NEXT: cvttps2dq %xmm3, %xmm2 +; SSE-NEXT: pmulld %xmm2, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_vec_shl_min256: +; AVX: # %bb.0: +; AVX-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 +; AVX-NEXT: retq +; +; AVX512-LABEL: combine_vec_shl_min256: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: retq + %1 = bitcast <4 x i64> %x to <8 x i32> + %2 = bitcast <4 x i64> %y to <8 x i32> + %3 = icmp ult <8 x i32> %2, + %4 = select <8 x i1> %3, <8 x i32> %1, <8 x i32> zeroinitializer + %5 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer + %6 = shl <8 x i32> %4, %5 + %7 = bitcast <8 x i32> %6 to <4 x i64> + ret <4 x i64> %7 +} + +define <8 x i64> @combine_vec_shl_min512(<8 x i64> %x, <8 x i64> %y) { +; SSE-LABEL: combine_vec_shl_min512: +; SSE: # %bb.0: +; SSE-NEXT: pslld $23, %xmm4 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [1065353216,1065353216,1065353216,1065353216] +; SSE-NEXT: paddd %xmm8, %xmm4 +; SSE-NEXT: cvttps2dq %xmm4, %xmm4 +; SSE-NEXT: pmulld %xmm4, %xmm0 +; SSE-NEXT: pslld $23, %xmm5 +; SSE-NEXT: paddd %xmm8, %xmm5 +; SSE-NEXT: cvttps2dq %xmm5, %xmm4 +; SSE-NEXT: pmulld %xmm4, %xmm1 +; SSE-NEXT: pslld $23, %xmm6 +; SSE-NEXT: paddd %xmm8, %xmm6 +; SSE-NEXT: cvttps2dq %xmm6, %xmm4 +; SSE-NEXT: pmulld %xmm4, %xmm2 +; SSE-NEXT: pslld $23, %xmm7 +; SSE-NEXT: paddd %xmm8, %xmm7 +; SSE-NEXT: cvttps2dq %xmm7, %xmm4 +; SSE-NEXT: pmulld %xmm4, %xmm3 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_vec_shl_min512: +; AVX: # %bb.0: +; AVX-NEXT: vpsllvd %ymm2, %ymm0, %ymm0 +; AVX-NEXT: vpsllvd %ymm3, %ymm1, %ymm1 +; AVX-NEXT: retq +; +; AVX512-LABEL: combine_vec_shl_min512: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %1 = bitcast <8 x i64> %x to <16 x i32> + %2 = bitcast <8 x i64> %y to <16 x i32> + %3 = icmp ult <16 x i32> %2, + %4 = select <16 x i1> %3, <16 x i32> %1, <16 x i32> zeroinitializer + %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer + %6 = shl <16 x i32> %4, %5 + %7 = bitcast <16 x i32> %6 to <8 x i64> + ret <8 x i64> %7 +} + +define <2 x i64> @combine_scalar_shl_i_128(<2 x i64> %x, i32 %y) { +; SSE-LABEL: combine_scalar_shl_i_128: +; SSE: # %bb.0: +; SSE-NEXT: movd %edi, %xmm1 +; SSE-NEXT: pslld %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_scalar_shl_i_128: +; AVX: # %bb.0: +; AVX-NEXT: vmovd %edi, %xmm1 +; AVX-NEXT: vpslld %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; AVX512-LABEL: combine_scalar_shl_i_128: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovd %edi, %xmm1 +; AVX512-NEXT: vpslld %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq + %1 = bitcast <2 x i64> %x to <4 x i32> + %2 = icmp ult i32 %y, 32 + %3 = insertelement <4 x i32> undef, i32 %y, i32 0 + %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> zeroinitializer + %5 = select i1 %2, <4 x i32> %1, <4 x i32> zeroinitializer + %6 = select i1 %2, <4 x i32> %4, <4 x i32> zeroinitializer + %7 = shl <4 x i32> %5, %6 + %8 = bitcast <4 x i32> %7 to <2 x i64> + ret <2 x i64> %8 +} + +define <4 x i64> @combine_scalar_shl_i_256(<4 x i64> %x, i32 %y) { +; SSE-LABEL: combine_scalar_shl_i_256: +; SSE: # %bb.0: +; SSE-NEXT: movd %edi, %xmm2 +; 
SSE-NEXT: pslld %xmm2, %xmm0 +; SSE-NEXT: pslld %xmm2, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_scalar_shl_i_256: +; AVX: # %bb.0: +; AVX-NEXT: vmovd %edi, %xmm1 +; AVX-NEXT: vpslld %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq +; +; AVX512-LABEL: combine_scalar_shl_i_256: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovd %edi, %xmm1 +; AVX512-NEXT: vpslld %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: retq + %1 = bitcast <4 x i64> %x to <8 x i32> + %2 = icmp ult i32 %y, 32 + %3 = insertelement <8 x i32> undef, i32 %y, i32 0 + %4 = shufflevector <8 x i32> %3, <8 x i32> undef, <8 x i32> zeroinitializer + %5 = select i1 %2, <8 x i32> %1, <8 x i32> zeroinitializer + %6 = select i1 %2, <8 x i32> %4, <8 x i32> zeroinitializer + %7 = shl <8 x i32> %5, %6 + %8 = bitcast <8 x i32> %7 to <4 x i64> + ret <4 x i64> %8 +} + +define <8 x i64> @combine_scalar_shl_i_512(<8 x i64> %x, i32 %y) { +; SSE-LABEL: combine_scalar_shl_i_512: +; SSE: # %bb.0: +; SSE-NEXT: movd %edi, %xmm4 +; SSE-NEXT: pslld %xmm4, %xmm0 +; SSE-NEXT: pslld %xmm4, %xmm1 +; SSE-NEXT: pslld %xmm4, %xmm2 +; SSE-NEXT: pslld %xmm4, %xmm3 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_scalar_shl_i_512: +; AVX: # %bb.0: +; AVX-NEXT: vmovd %edi, %xmm2 +; AVX-NEXT: vpslld %xmm2, %ymm0, %ymm0 +; AVX-NEXT: vpslld %xmm2, %ymm1, %ymm1 +; AVX-NEXT: retq +; +; AVX512-LABEL: combine_scalar_shl_i_512: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovd %edi, %xmm1 +; AVX512-NEXT: vpslld %xmm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %1 = bitcast <8 x i64> %x to <16 x i32> + %2 = icmp ult i32 %y, 32 + %3 = insertelement <16 x i32> undef, i32 %y, i32 0 + %4 = shufflevector <16 x i32> %3, <16 x i32> undef, <16 x i32> zeroinitializer + %5 = select i1 %2, <16 x i32> %1, <16 x i32> zeroinitializer + %6 = select i1 %2, <16 x i32> %4, <16 x i32> zeroinitializer + %7 = shl <16 x i32> %5, %6 + %8 = bitcast <16 x i32> %7 to <8 x i64> + ret <8 x i64> %8 +} + +define <2 x i64> @combine_scalar_shl_128(<2 x i64> %x, <2 x i64> %y) { +; SSE-LABEL: combine_scalar_shl_128: +; SSE: # %bb.0: +; SSE-NEXT: pslld %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_scalar_shl_128: +; AVX: # %bb.0: +; AVX-NEXT: vpslld %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; AVX512-LABEL: combine_scalar_shl_128: +; AVX512: # %bb.0: +; AVX512-NEXT: vpslld %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq + %1 = bitcast <2 x i64> %x to <4 x i32> + %2 = extractelement <2 x i64> %y, i64 0 + %3 = icmp ult i64 %2, 32 + %4 = trunc i64 %2 to i32 + %5 = insertelement <4 x i32> undef, i32 %4, i32 0 + %6 = shufflevector <4 x i32> %5, <4 x i32> undef, <4 x i32> zeroinitializer + %7 = select i1 %3, <4 x i32> %1, <4 x i32> zeroinitializer + %8 = select i1 %3, <4 x i32> %6, <4 x i32> zeroinitializer + %9 = shl <4 x i32> %7, %8 + %10 = bitcast <4 x i32> %9 to <2 x i64> + ret <2 x i64> %10 +} + +define <4 x i64> @combine_scalar_shl_256(<4 x i64> %x, <4 x i64> %y) { +; SSE-LABEL: combine_scalar_shl_256: +; SSE: # %bb.0: +; SSE-NEXT: pslld %xmm2, %xmm0 +; SSE-NEXT: pslld %xmm2, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_scalar_shl_256: +; AVX: # %bb.0: +; AVX-NEXT: vpslld %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq +; +; AVX512-LABEL: combine_scalar_shl_256: +; AVX512: # %bb.0: +; AVX512-NEXT: vpslld %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: retq + %1 = bitcast <4 x i64> %x to <8 x i32> + %2 = extractelement <4 x i64> %y, i64 0 + %3 = icmp ult i64 %2, 32 + %4 = trunc i64 %2 to i32 + %5 = insertelement <8 x i32> undef, i32 %4, i32 0 + %6 = shufflevector <8 x i32> %5, <8 x i32> undef, <8 x i32> zeroinitializer + %7 = select i1 %3, <8 x i32> %1, <8 x i32> 
zeroinitializer + %8 = select i1 %3, <8 x i32> %6, <8 x i32> zeroinitializer + %9 = shl <8 x i32> %7, %8 + %10 = bitcast <8 x i32> %9 to <4 x i64> + ret <4 x i64> %10 +} + +define <8 x i64> @combine_scalar_shl_512(<8 x i64> %x, <8 x i64> %y) { +; SSE-LABEL: combine_scalar_shl_512: +; SSE: # %bb.0: +; SSE-NEXT: pslld %xmm4, %xmm0 +; SSE-NEXT: pslld %xmm4, %xmm1 +; SSE-NEXT: pslld %xmm4, %xmm2 +; SSE-NEXT: pslld %xmm4, %xmm3 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_scalar_shl_512: +; AVX: # %bb.0: +; AVX-NEXT: vpslld %xmm2, %ymm0, %ymm0 +; AVX-NEXT: vpslld %xmm2, %ymm1, %ymm1 +; AVX-NEXT: retq +; +; AVX512-LABEL: combine_scalar_shl_512: +; AVX512: # %bb.0: +; AVX512-NEXT: vpslld %xmm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %1 = bitcast <8 x i64> %x to <16 x i32> + %2 = extractelement <8 x i64> %y, i64 0 + %3 = icmp ult i64 %2, 32 + %4 = trunc i64 %2 to i32 + %5 = insertelement <16 x i32> undef, i32 %4, i32 0 + %6 = shufflevector <16 x i32> %5, <16 x i32> undef, <16 x i32> zeroinitializer + %7 = select i1 %3, <16 x i32> %1, <16 x i32> zeroinitializer + %8 = select i1 %3, <16 x i32> %6, <16 x i32> zeroinitializer + %9 = shl <16 x i32> %7, %8 + %10 = bitcast <16 x i32> %9 to <8 x i64> + ret <8 x i64> %10 +} Index: test/CodeGen/X86/combine-sra.ll =================================================================== --- test/CodeGen/X86/combine-sra.ll +++ test/CodeGen/X86/combine-sra.ll @@ -2,6 +2,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2-SLOW ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2-FAST +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512 ; fold (sra 0, x) -> 0 define <4 x i32> @combine_vec_ashr_zero(<4 x i32> %x) { @@ -270,6 +271,368 @@ ret <4 x i32> %3 } +; fold (sra x, (min y, c)) -> (sra x, y) +; if c is vector of constants equal (bitsize - 1) of y elements +define <2 x i64> @combine_vec_ashr_min128(<2 x i64> %x, <2 x i64> %y) { +; SSE-LABEL: combine_vec_ashr_min128: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: psrad %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: psrlq $32, %xmm2 +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: psrad %xmm2, %xmm4 +; SSE-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7] +; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: psrad %xmm1, %xmm2 +; SSE-NEXT: psrad %xmm3, %xmm0 +; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] +; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_vec_ashr_min128: +; AVX: # %bb.0: +; AVX-NEXT: vpsravd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; AVX512-LABEL: combine_vec_ashr_min128: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsravd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq + %1 = bitcast <2 x i64> %x to <4 x i32> + %2 = bitcast <2 x i64> %y to <4 x i32> + %3 = icmp ult <4 x i32> %2, + %4 = select <4 x i1> %3, <4 x i32> %2, <4 x i32> + %5 = ashr <4 x i32> %1, %4 + %6 = bitcast <4 x i32> %5 to <2 x i64> + 
ret <2 x i64> %6 +} + +define <4 x i64> @combine_vec_ashr_min256(<4 x i64> %x, <4 x i64> %y) { +; SSE-LABEL: combine_vec_ashr_min256: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: psrad %xmm4, %xmm5 +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: psrlq $32, %xmm4 +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: psrad %xmm4, %xmm6 +; SSE-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7] +; SSE-NEXT: pxor %xmm4, %xmm4 +; SSE-NEXT: pmovzxdq {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE-NEXT: movdqa %xmm0, %xmm7 +; SSE-NEXT: psrad %xmm2, %xmm7 +; SSE-NEXT: psrad %xmm5, %xmm0 +; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm7[4,5,6,7] +; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3],xmm0[4,5],xmm6[6,7] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: movdqa %xmm1, %xmm5 +; SSE-NEXT: psrad %xmm2, %xmm5 +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: psrlq $32, %xmm2 +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: psrad %xmm2, %xmm6 +; SSE-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7] +; SSE-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: psrad %xmm3, %xmm4 +; SSE-NEXT: psrad %xmm2, %xmm1 +; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7] +; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3],xmm1[4,5],xmm6[6,7] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_vec_ashr_min256: +; AVX: # %bb.0: +; AVX-NEXT: vpsravd %ymm1, %ymm0, %ymm0 +; AVX-NEXT: retq +; +; AVX512-LABEL: combine_vec_ashr_min256: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsravd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: retq + %1 = bitcast <4 x i64> %x to <8 x i32> + %2 = bitcast <4 x i64> %y to <8 x i32> + %3 = icmp ult <8 x i32> %2, + %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> + %5 = ashr <8 x i32> %1, %4 + %6 = bitcast <8 x i32> %5 to <4 x i64> + ret <4 x i64> %6 +} + +define <8 x i64> @combine_vec_ashr_min512(<8 x i64> %x, <8 x i64> %y) { +; SSE-LABEL: combine_vec_ashr_min512: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm4, %xmm8 +; SSE-NEXT: psrldq {{.*#+}} xmm8 = xmm8[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: movdqa %xmm0, %xmm9 +; SSE-NEXT: psrad %xmm8, %xmm9 +; SSE-NEXT: movdqa %xmm4, %xmm8 +; SSE-NEXT: psrlq $32, %xmm8 +; SSE-NEXT: movdqa %xmm0, %xmm10 +; SSE-NEXT: psrad %xmm8, %xmm10 +; SSE-NEXT: pblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm9[4,5,6,7] +; SSE-NEXT: pxor %xmm8, %xmm8 +; SSE-NEXT: pmovzxdq {{.*#+}} xmm9 = xmm4[0],zero,xmm4[1],zero +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm8[2],xmm4[3],xmm8[3] +; SSE-NEXT: movdqa %xmm0, %xmm11 +; SSE-NEXT: psrad %xmm4, %xmm11 +; SSE-NEXT: psrad %xmm9, %xmm0 +; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm11[4,5,6,7] +; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm10[2,3],xmm0[4,5],xmm10[6,7] +; SSE-NEXT: movdqa %xmm5, %xmm4 +; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: movdqa %xmm1, %xmm9 +; SSE-NEXT: psrad %xmm4, %xmm9 +; SSE-NEXT: movdqa %xmm5, %xmm10 +; SSE-NEXT: psrlq $32, %xmm10 +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: psrad %xmm10, %xmm4 
+; SSE-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm9[4,5,6,7] +; SSE-NEXT: pmovzxdq {{.*#+}} xmm9 = xmm5[0],zero,xmm5[1],zero +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm8[2],xmm5[3],xmm8[3] +; SSE-NEXT: movdqa %xmm1, %xmm10 +; SSE-NEXT: psrad %xmm5, %xmm10 +; SSE-NEXT: psrad %xmm9, %xmm1 +; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm10[4,5,6,7] +; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7] +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: movdqa %xmm2, %xmm9 +; SSE-NEXT: psrad %xmm4, %xmm9 +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: psrlq $32, %xmm4 +; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: psrad %xmm4, %xmm5 +; SSE-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm9[4,5,6,7] +; SSE-NEXT: pmovzxdq {{.*#+}} xmm9 = xmm6[0],zero,xmm6[1],zero +; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm8[2],xmm6[3],xmm8[3] +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: psrad %xmm6, %xmm4 +; SSE-NEXT: psrad %xmm9, %xmm2 +; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7] +; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5],xmm5[6,7] +; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: movdqa %xmm3, %xmm5 +; SSE-NEXT: psrad %xmm4, %xmm5 +; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: psrlq $32, %xmm4 +; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: psrad %xmm4, %xmm6 +; SSE-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7] +; SSE-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm7[0],zero,xmm7[1],zero +; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; SSE-NEXT: movdqa %xmm3, %xmm5 +; SSE-NEXT: psrad %xmm7, %xmm5 +; SSE-NEXT: psrad %xmm4, %xmm3 +; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm5[4,5,6,7] +; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm6[2,3],xmm3[4,5],xmm6[6,7] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_vec_ashr_min512: +; AVX: # %bb.0: +; AVX-NEXT: vpsravd %ymm2, %ymm0, %ymm0 +; AVX-NEXT: vpsravd %ymm3, %ymm1, %ymm1 +; AVX-NEXT: retq +; +; AVX512-LABEL: combine_vec_ashr_min512: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsravd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %1 = bitcast <8 x i64> %x to <16 x i32> + %2 = bitcast <8 x i64> %y to <16 x i32> + %3 = icmp ult <16 x i32> %2, + %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> + %5 = ashr <16 x i32> %1, %4 + %6 = bitcast <16 x i32> %5 to <8 x i64> + ret <8 x i64> %6 +} + +define <2 x i64> @combine_scalar_ashr_i_128(<2 x i64> %x, i32 %y) { +; SSE-LABEL: combine_scalar_ashr_i_128: +; SSE: # %bb.0: +; SSE-NEXT: movd %edi, %xmm1 +; SSE-NEXT: psrad %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_scalar_ashr_i_128: +; AVX: # %bb.0: +; AVX-NEXT: vmovd %edi, %xmm1 +; AVX-NEXT: vpsrad %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; AVX512-LABEL: combine_scalar_ashr_i_128: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovd %edi, %xmm1 +; AVX512-NEXT: vpsrad %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq + %1 = bitcast <2 x i64> %x to <4 x i32> + %2 = icmp ult i32 %y, 31 + %3 = select i1 %2, i32 %y, i32 31 + %4 = insertelement <4 x i32> undef, i32 %3, i32 0 + %5 = shufflevector <4 x i32> %4, <4 x i32> undef, <4 x i32> zeroinitializer + %6 = ashr <4 x i32> %1, %5 + %7 = bitcast <4 x i32> %6 to <2 x i64> + ret <2 x i64> %7 +} + +define <4 x i64> @combine_scalar_ashr_i_256(<4 x i64> %x, i32 %y) { +; SSE-LABEL: combine_scalar_ashr_i_256: +; SSE: # %bb.0: +; 
SSE-NEXT: movd %edi, %xmm2 +; SSE-NEXT: psrad %xmm2, %xmm0 +; SSE-NEXT: psrad %xmm2, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_scalar_ashr_i_256: +; AVX: # %bb.0: +; AVX-NEXT: vmovd %edi, %xmm1 +; AVX-NEXT: vpsrad %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq +; +; AVX512-LABEL: combine_scalar_ashr_i_256: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovd %edi, %xmm1 +; AVX512-NEXT: vpsrad %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: retq + %1 = bitcast <4 x i64> %x to <8 x i32> + %2 = icmp ult i32 %y, 31 + %3 = select i1 %2, i32 %y, i32 31 + %4 = insertelement <8 x i32> undef, i32 %3, i32 0 + %5 = shufflevector <8 x i32> %4, <8 x i32> undef, <8 x i32> zeroinitializer + %6 = ashr <8 x i32> %1, %5 + %7 = bitcast <8 x i32> %6 to <4 x i64> + ret <4 x i64> %7 +} + +define <8 x i64> @combine_scalar_ashr_i_512(<8 x i64> %x, i32 %y) { +; SSE-LABEL: combine_scalar_ashr_i_512: +; SSE: # %bb.0: +; SSE-NEXT: movd %edi, %xmm4 +; SSE-NEXT: psrad %xmm4, %xmm0 +; SSE-NEXT: psrad %xmm4, %xmm1 +; SSE-NEXT: psrad %xmm4, %xmm2 +; SSE-NEXT: psrad %xmm4, %xmm3 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_scalar_ashr_i_512: +; AVX: # %bb.0: +; AVX-NEXT: vmovd %edi, %xmm2 +; AVX-NEXT: vpsrad %xmm2, %ymm0, %ymm0 +; AVX-NEXT: vpsrad %xmm2, %ymm1, %ymm1 +; AVX-NEXT: retq +; +; AVX512-LABEL: combine_scalar_ashr_i_512: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovd %edi, %xmm1 +; AVX512-NEXT: vpsrad %xmm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %1 = bitcast <8 x i64> %x to <16 x i32> + %2 = icmp ult i32 %y, 31 + %3 = select i1 %2, i32 %y, i32 31 + %4 = insertelement <16 x i32> undef, i32 %3, i32 0 + %5 = shufflevector <16 x i32> %4, <16 x i32> undef, <16 x i32> zeroinitializer + %6 = ashr <16 x i32> %1, %5 + %7 = bitcast <16 x i32> %6 to <8 x i64> + ret <8 x i64> %7 +} + +define <2 x i64> @combine_scalar_ashr_128(<2 x i64> %x, <2 x i64> %y) { +; SSE-LABEL: combine_scalar_ashr_128: +; SSE: # %bb.0: +; SSE-NEXT: psrad %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_scalar_ashr_128: +; AVX: # %bb.0: +; AVX-NEXT: vpsrad %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; AVX512-LABEL: combine_scalar_ashr_128: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrad %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq + %1 = bitcast <2 x i64> %x to <4 x i32> + %2 = extractelement <2 x i64> %y, i64 0 + %3 = icmp ult i64 %2, 31 + %4 = select i1 %3, i64 %2, i64 31 + %5 = trunc i64 %4 to i32 + %6 = insertelement <4 x i32> undef, i32 %5, i32 0 + %7 = shufflevector <4 x i32> %6, <4 x i32> undef, <4 x i32> zeroinitializer + %8 = ashr <4 x i32> %1, %7 + %9 = bitcast <4 x i32> %8 to <2 x i64> + ret <2 x i64> %9 +} + +define <4 x i64> @combine_scalar_ashr_256(<4 x i64> %x, <4 x i64> %y) { +; SSE-LABEL: combine_scalar_ashr_256: +; SSE: # %bb.0: +; SSE-NEXT: psrad %xmm2, %xmm0 +; SSE-NEXT: psrad %xmm2, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_scalar_ashr_256: +; AVX: # %bb.0: +; AVX-NEXT: vpsrad %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq +; +; AVX512-LABEL: combine_scalar_ashr_256: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrad %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: retq + %1 = bitcast <4 x i64> %x to <8 x i32> + %2 = extractelement <4 x i64> %y, i64 0 + %3 = icmp ult i64 %2, 31 + %4 = select i1 %3, i64 %2, i64 31 + %5 = trunc i64 %4 to i32 + %6 = insertelement <8 x i32> undef, i32 %5, i32 0 + %7 = shufflevector <8 x i32> %6, <8 x i32> undef, <8 x i32> zeroinitializer + %8 = ashr <8 x i32> %1, %7 + %9 = bitcast <8 x i32> %8 to <4 x i64> + ret <4 x i64> %9 +} + +define <8 x i64> @combine_scalar_ashr_512(<8 x i64> %x, <8 x i64> %y) { +; SSE-LABEL: combine_scalar_ashr_512: +; SSE: # %bb.0: +; 
SSE-NEXT: psrad %xmm4, %xmm0 +; SSE-NEXT: psrad %xmm4, %xmm1 +; SSE-NEXT: psrad %xmm4, %xmm2 +; SSE-NEXT: psrad %xmm4, %xmm3 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_scalar_ashr_512: +; AVX: # %bb.0: +; AVX-NEXT: vpsrad %xmm2, %ymm0, %ymm0 +; AVX-NEXT: vpsrad %xmm2, %ymm1, %ymm1 +; AVX-NEXT: retq +; +; AVX512-LABEL: combine_scalar_ashr_512: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrad %xmm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %1 = bitcast <8 x i64> %x to <16 x i32> + %2 = extractelement <8 x i64> %y, i64 0 + %3 = icmp ult i64 %2, 31 + %4 = select i1 %3, i64 %2, i64 31 + %5 = trunc i64 %4 to i32 + %6 = insertelement <16 x i32> undef, i32 %5, i32 0 + %7 = shufflevector <16 x i32> %6, <16 x i32> undef, <16 x i32> zeroinitializer + %8 = ashr <16 x i32> %1, %7 + %9 = bitcast <16 x i32> %8 to <8 x i64> + ret <8 x i64> %9 +} + ; If the sign bit is known to be zero, switch this to a SRL. define <4 x i32> @combine_vec_ashr_positive(<4 x i32> %x, <4 x i32> %y) { ; SSE-LABEL: combine_vec_ashr_positive: Index: test/CodeGen/X86/combine-srl.ll =================================================================== --- test/CodeGen/X86/combine-srl.ll +++ test/CodeGen/X86/combine-srl.ll @@ -2,6 +2,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2-SLOW ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2-FAST +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512 ; fold (srl 0, x) -> 0 define <4 x i32> @combine_vec_lshr_zero(<4 x i32> %x) { @@ -466,3 +467,375 @@ %3 = lshr <4 x i32> %x, %2 ret <4 x i32> %3 } + +; fold (srl (select (setcc y, c, lt), xvec, zerovec), +; (select (setcc y, c, lt), yvec, zerovec)) +; -> (srl xvec, (smin yvec, maxvec)) +define <2 x i64> @combine_vec_srl_min128(<2 x i64> %x, <2 x i64> %y) { +; SSE-LABEL: combine_vec_srl_min128: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: psrld %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: psrlq $32, %xmm2 +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: psrld %xmm2, %xmm4 +; SSE-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7] +; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: psrld %xmm1, %xmm2 +; SSE-NEXT: psrld %xmm3, %xmm0 +; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] +; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_vec_srl_min128: +; AVX: # %bb.0: +; AVX-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; AVX512-LABEL: combine_vec_srl_min128: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq + %1 = bitcast <2 x i64> %x to <4 x i32> + %2 = bitcast <2 x i64> %y to <4 x i32> + %3 = icmp ult <4 x i32> %2, + %4 = select <4 x i1> %3, <4 x i32> %1, <4 x i32> zeroinitializer + %5 = select <4 x i1> %3, <4 x i32> %2, <4 x i32> zeroinitializer + %6 = lshr <4 x i32> %4, %5 + %7 = bitcast <4 x i32> %6 to <2 x i64> + ret <2 x i64> %7 +} + +define <4 x i64> @combine_vec_srl_min256(<4 x i64> 
%x, <4 x i64> %y) { +; SSE-LABEL: combine_vec_srl_min256: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: psrld %xmm4, %xmm5 +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: psrlq $32, %xmm4 +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: psrld %xmm4, %xmm6 +; SSE-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7] +; SSE-NEXT: pxor %xmm4, %xmm4 +; SSE-NEXT: pmovzxdq {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE-NEXT: movdqa %xmm0, %xmm7 +; SSE-NEXT: psrld %xmm2, %xmm7 +; SSE-NEXT: psrld %xmm5, %xmm0 +; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm7[4,5,6,7] +; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3],xmm0[4,5],xmm6[6,7] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: movdqa %xmm1, %xmm5 +; SSE-NEXT: psrld %xmm2, %xmm5 +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: psrlq $32, %xmm2 +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: psrld %xmm2, %xmm6 +; SSE-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7] +; SSE-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: psrld %xmm3, %xmm4 +; SSE-NEXT: psrld %xmm2, %xmm1 +; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7] +; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3],xmm1[4,5],xmm6[6,7] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_vec_srl_min256: +; AVX: # %bb.0: +; AVX-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 +; AVX-NEXT: retq +; +; AVX512-LABEL: combine_vec_srl_min256: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: retq + %1 = bitcast <4 x i64> %x to <8 x i32> + %2 = bitcast <4 x i64> %y to <8 x i32> + %3 = icmp ult <8 x i32> %2, + %4 = select <8 x i1> %3, <8 x i32> %1, <8 x i32> zeroinitializer + %5 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer + %6 = lshr <8 x i32> %4, %5 + %7 = bitcast <8 x i32> %6 to <4 x i64> + ret <4 x i64> %7 +} + +define <8 x i64> @combine_vec_srl_min512(<8 x i64> %x, <8 x i64> %y) { +; SSE-LABEL: combine_vec_srl_min512: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm4, %xmm8 +; SSE-NEXT: psrldq {{.*#+}} xmm8 = xmm8[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: movdqa %xmm0, %xmm9 +; SSE-NEXT: psrld %xmm8, %xmm9 +; SSE-NEXT: movdqa %xmm4, %xmm8 +; SSE-NEXT: psrlq $32, %xmm8 +; SSE-NEXT: movdqa %xmm0, %xmm10 +; SSE-NEXT: psrld %xmm8, %xmm10 +; SSE-NEXT: pblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm9[4,5,6,7] +; SSE-NEXT: pxor %xmm8, %xmm8 +; SSE-NEXT: pmovzxdq {{.*#+}} xmm9 = xmm4[0],zero,xmm4[1],zero +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm8[2],xmm4[3],xmm8[3] +; SSE-NEXT: movdqa %xmm0, %xmm11 +; SSE-NEXT: psrld %xmm4, %xmm11 +; SSE-NEXT: psrld %xmm9, %xmm0 +; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm11[4,5,6,7] +; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm10[2,3],xmm0[4,5],xmm10[6,7] +; SSE-NEXT: movdqa %xmm5, %xmm4 +; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: movdqa %xmm1, %xmm9 +; SSE-NEXT: psrld %xmm4, %xmm9 +; SSE-NEXT: movdqa %xmm5, %xmm10 +; SSE-NEXT: psrlq $32, %xmm10 +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: psrld %xmm10, 
%xmm4 +; SSE-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm9[4,5,6,7] +; SSE-NEXT: pmovzxdq {{.*#+}} xmm9 = xmm5[0],zero,xmm5[1],zero +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm8[2],xmm5[3],xmm8[3] +; SSE-NEXT: movdqa %xmm1, %xmm10 +; SSE-NEXT: psrld %xmm5, %xmm10 +; SSE-NEXT: psrld %xmm9, %xmm1 +; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm10[4,5,6,7] +; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7] +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: movdqa %xmm2, %xmm9 +; SSE-NEXT: psrld %xmm4, %xmm9 +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: psrlq $32, %xmm4 +; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: psrld %xmm4, %xmm5 +; SSE-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm9[4,5,6,7] +; SSE-NEXT: pmovzxdq {{.*#+}} xmm9 = xmm6[0],zero,xmm6[1],zero +; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm8[2],xmm6[3],xmm8[3] +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: psrld %xmm6, %xmm4 +; SSE-NEXT: psrld %xmm9, %xmm2 +; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7] +; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5],xmm5[6,7] +; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: movdqa %xmm3, %xmm5 +; SSE-NEXT: psrld %xmm4, %xmm5 +; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: psrlq $32, %xmm4 +; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: psrld %xmm4, %xmm6 +; SSE-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7] +; SSE-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm7[0],zero,xmm7[1],zero +; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; SSE-NEXT: movdqa %xmm3, %xmm5 +; SSE-NEXT: psrld %xmm7, %xmm5 +; SSE-NEXT: psrld %xmm4, %xmm3 +; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm5[4,5,6,7] +; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm6[2,3],xmm3[4,5],xmm6[6,7] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_vec_srl_min512: +; AVX: # %bb.0: +; AVX-NEXT: vpsrlvd %ymm2, %ymm0, %ymm0 +; AVX-NEXT: vpsrlvd %ymm3, %ymm1, %ymm1 +; AVX-NEXT: retq +; +; AVX512-LABEL: combine_vec_srl_min512: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %1 = bitcast <8 x i64> %x to <16 x i32> + %2 = bitcast <8 x i64> %y to <16 x i32> + %3 = icmp ult <16 x i32> %2, + %4 = select <16 x i1> %3, <16 x i32> %1, <16 x i32> zeroinitializer + %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer + %6 = lshr <16 x i32> %4, %5 + %7 = bitcast <16 x i32> %6 to <8 x i64> + ret <8 x i64> %7 +} + +define <2 x i64> @combine_scalar_srl_i_128(<2 x i64> %x, i32 %y) { +; SSE-LABEL: combine_scalar_srl_i_128: +; SSE: # %bb.0: +; SSE-NEXT: movd %edi, %xmm1 +; SSE-NEXT: psrld %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_scalar_srl_i_128: +; AVX: # %bb.0: +; AVX-NEXT: vmovd %edi, %xmm1 +; AVX-NEXT: vpsrld %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; AVX512-LABEL: combine_scalar_srl_i_128: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovd %edi, %xmm1 +; AVX512-NEXT: vpsrld %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq + %1 = bitcast <2 x i64> %x to <4 x i32> + %2 = icmp ult i32 %y, 32 + %3 = insertelement <4 x i32> undef, i32 %y, i32 0 + %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> zeroinitializer + %5 = select i1 %2, <4 x i32> %1, <4 x i32> zeroinitializer + %6 = select i1 %2, <4 x i32> %4, <4 x i32> zeroinitializer + %7 = lshr <4 x i32> %5, %6 + %8 = bitcast <4 x i32> 
%7 to <2 x i64> + ret <2 x i64> %8 +} + +define <4 x i64> @combine_scalar_srl_i_256(<4 x i64> %x, i32 %y) { +; SSE-LABEL: combine_scalar_srl_i_256: +; SSE: # %bb.0: +; SSE-NEXT: movd %edi, %xmm2 +; SSE-NEXT: psrld %xmm2, %xmm0 +; SSE-NEXT: psrld %xmm2, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_scalar_srl_i_256: +; AVX: # %bb.0: +; AVX-NEXT: vmovd %edi, %xmm1 +; AVX-NEXT: vpsrld %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq +; +; AVX512-LABEL: combine_scalar_srl_i_256: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovd %edi, %xmm1 +; AVX512-NEXT: vpsrld %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: retq + %1 = bitcast <4 x i64> %x to <8 x i32> + %2 = icmp ult i32 %y, 32 + %3 = insertelement <8 x i32> undef, i32 %y, i32 0 + %4 = shufflevector <8 x i32> %3, <8 x i32> undef, <8 x i32> zeroinitializer + %5 = select i1 %2, <8 x i32> %1, <8 x i32> zeroinitializer + %6 = select i1 %2, <8 x i32> %4, <8 x i32> zeroinitializer + %7 = lshr <8 x i32> %5, %6 + %8 = bitcast <8 x i32> %7 to <4 x i64> + ret <4 x i64> %8 +} + +define <8 x i64> @combine_scalar_srl_i_512(<8 x i64> %x, i32 %y) { +; SSE-LABEL: combine_scalar_srl_i_512: +; SSE: # %bb.0: +; SSE-NEXT: movd %edi, %xmm4 +; SSE-NEXT: psrld %xmm4, %xmm0 +; SSE-NEXT: psrld %xmm4, %xmm1 +; SSE-NEXT: psrld %xmm4, %xmm2 +; SSE-NEXT: psrld %xmm4, %xmm3 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_scalar_srl_i_512: +; AVX: # %bb.0: +; AVX-NEXT: vmovd %edi, %xmm2 +; AVX-NEXT: vpsrld %xmm2, %ymm0, %ymm0 +; AVX-NEXT: vpsrld %xmm2, %ymm1, %ymm1 +; AVX-NEXT: retq +; +; AVX512-LABEL: combine_scalar_srl_i_512: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovd %edi, %xmm1 +; AVX512-NEXT: vpsrld %xmm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %1 = bitcast <8 x i64> %x to <16 x i32> + %2 = icmp ult i32 %y, 32 + %3 = insertelement <16 x i32> undef, i32 %y, i32 0 + %4 = shufflevector <16 x i32> %3, <16 x i32> undef, <16 x i32> zeroinitializer + %5 = select i1 %2, <16 x i32> %1, <16 x i32> zeroinitializer + %6 = select i1 %2, <16 x i32> %4, <16 x i32> zeroinitializer + %7 = lshr <16 x i32> %5, %6 + %8 = bitcast <16 x i32> %7 to <8 x i64> + ret <8 x i64> %8 +} + +define <2 x i64> @combine_scalar_srl_128(<2 x i64> %x, <2 x i64> %y) { +; SSE-LABEL: combine_scalar_srl_128: +; SSE: # %bb.0: +; SSE-NEXT: psrld %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_scalar_srl_128: +; AVX: # %bb.0: +; AVX-NEXT: vpsrld %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; AVX512-LABEL: combine_scalar_srl_128: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrld %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq + %1 = bitcast <2 x i64> %x to <4 x i32> + %2 = extractelement <2 x i64> %y, i64 0 + %3 = icmp ult i64 %2, 32 + %4 = trunc i64 %2 to i32 + %5 = insertelement <4 x i32> undef, i32 %4, i32 0 + %6 = shufflevector <4 x i32> %5, <4 x i32> undef, <4 x i32> zeroinitializer + %7 = select i1 %3, <4 x i32> %1, <4 x i32> zeroinitializer + %8 = select i1 %3, <4 x i32> %6, <4 x i32> zeroinitializer + %9 = lshr <4 x i32> %7, %8 + %10 = bitcast <4 x i32> %9 to <2 x i64> + ret <2 x i64> %10 +} + +define <4 x i64> @combine_scalar_srl_256(<4 x i64> %x, <4 x i64> %y) { +; SSE-LABEL: combine_scalar_srl_256: +; SSE: # %bb.0: +; SSE-NEXT: psrld %xmm2, %xmm0 +; SSE-NEXT: psrld %xmm2, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_scalar_srl_256: +; AVX: # %bb.0: +; AVX-NEXT: vpsrld %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq +; +; AVX512-LABEL: combine_scalar_srl_256: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrld %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: retq + %1 = bitcast <4 x i64> %x to <8 x i32> + %2 = extractelement <4 x i64> %y, i64 0 + %3 = icmp ult i64 %2, 32 + 
%4 = trunc i64 %2 to i32 + %5 = insertelement <8 x i32> undef, i32 %4, i32 0 + %6 = shufflevector <8 x i32> %5, <8 x i32> undef, <8 x i32> zeroinitializer + %7 = select i1 %3, <8 x i32> %1, <8 x i32> zeroinitializer + %8 = select i1 %3, <8 x i32> %6, <8 x i32> zeroinitializer + %9 = lshr <8 x i32> %7, %8 + %10 = bitcast <8 x i32> %9 to <4 x i64> + ret <4 x i64> %10 +} + +define <8 x i64> @combine_scalar_srl_512(<8 x i64> %x, <8 x i64> %y) { +; SSE-LABEL: combine_scalar_srl_512: +; SSE: # %bb.0: +; SSE-NEXT: psrld %xmm4, %xmm0 +; SSE-NEXT: psrld %xmm4, %xmm1 +; SSE-NEXT: psrld %xmm4, %xmm2 +; SSE-NEXT: psrld %xmm4, %xmm3 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_scalar_srl_512: +; AVX: # %bb.0: +; AVX-NEXT: vpsrld %xmm2, %ymm0, %ymm0 +; AVX-NEXT: vpsrld %xmm2, %ymm1, %ymm1 +; AVX-NEXT: retq +; +; AVX512-LABEL: combine_scalar_srl_512: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrld %xmm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %1 = bitcast <8 x i64> %x to <16 x i32> + %2 = extractelement <8 x i64> %y, i64 0 + %3 = icmp ult i64 %2, 32 + %4 = trunc i64 %2 to i32 + %5 = insertelement <16 x i32> undef, i32 %4, i32 0 + %6 = shufflevector <16 x i32> %5, <16 x i32> undef, <16 x i32> zeroinitializer + %7 = select i1 %3, <16 x i32> %1, <16 x i32> zeroinitializer + %8 = select i1 %3, <16 x i32> %6, <16 x i32> zeroinitializer + %9 = lshr <16 x i32> %7, %8 + %10 = bitcast <16 x i32> %9 to <8 x i64> + ret <8 x i64> %10 +} Index: test/Transforms/InstCombine/X86/x86-vector-shifts.ll =================================================================== --- test/Transforms/InstCombine/X86/x86-vector-shifts.ll +++ test/Transforms/InstCombine/X86/x86-vector-shifts.ll @@ -2675,13 +2675,153 @@ } ; +; ASHR Unknown Shift Vector +; + +define <2 x i64> @avx2_psrav_d_vec(<2 x i64> %v, <2 x i64> %a) { +; CHECK-LABEL: @avx2_psrav_d_vec( +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> %v to <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> %a to <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult <4 x i32> [[TMP2]], +; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP3]], <4 x i32> [[TMP2]], <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <2 x i64> +; CHECK-NEXT: ret <2 x i64> [[TMP6]] +; + %1 = bitcast <2 x i64> %v to <4 x i32> + %2 = bitcast <2 x i64> %a to <4 x i32> + %3 = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %1, <4 x i32> %2) + %4 = bitcast <4 x i32> %3 to <2 x i64> + ret <2 x i64> %4 +} + +define <8 x i32> @avx2_psrav_d_256_vec(<8 x i32> %v, <8 x i32> %a) { +; CHECK-LABEL: @avx2_psrav_d_256_vec( +; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <8 x i32> %a, +; CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> %a, <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = ashr <8 x i32> %v, [[TMP2]] +; CHECK-NEXT: ret <8 x i32> [[TMP3]] +; + %1 = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %v, <8 x i32> %a) + ret <8 x i32> %1 +} + +define <8 x i64> @avx512_psrav_d_512_vec(<8 x i64> %v, <8 x i64> %a) { +; CHECK-LABEL: @avx512_psrav_d_512_vec( +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i64> %v to <16 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i64> %a to <16 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult <16 x i32> [[TMP2]], +; CHECK-NEXT: [[TMP4:%.*]] = select <16 x i1> [[TMP3]], <16 x i32> [[TMP2]], <16 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = ashr <16 x i32> [[TMP1]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP5]] to <8 x i64> +; CHECK-NEXT: ret <8 x i64> [[TMP6]] +; + %1 = bitcast <8 x i64> %v to <16 x i32> + 
%2 = bitcast <8 x i64> %a to <16 x i32> + %3 = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> %1, <16 x i32> %2) + %4 = bitcast <16 x i32> %3 to <8 x i64> + ret <8 x i64> %4 +} + +define <2 x i64> @avx512_psrav_q_128_vec(<2 x i64> %v, <2 x i64> %a) { +; CHECK-LABEL: @avx512_psrav_q_128_vec( +; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <2 x i64> %a, +; CHECK-NEXT: [[TMP2:%.*]] = select <2 x i1> [[TMP1]], <2 x i64> %a, <2 x i64> +; CHECK-NEXT: [[TMP3:%.*]] = ashr <2 x i64> %v, [[TMP2]] +; CHECK-NEXT: ret <2 x i64> [[TMP3]] +; + %1 = call <2 x i64> @llvm.x86.avx512.psrav.q.128(<2 x i64> %v, <2 x i64> %a) + ret <2 x i64> %1 +} + +define <4 x i64> @avx512_psrav_q_256_vec(<4 x i64> %v, <4 x i64> %a) { +; CHECK-LABEL: @avx512_psrav_q_256_vec( +; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <4 x i64> %a, +; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i64> %a, <4 x i64> +; CHECK-NEXT: [[TMP3:%.*]] = ashr <4 x i64> %v, [[TMP2]] +; CHECK-NEXT: ret <4 x i64> [[TMP3]] +; + %1 = call <4 x i64> @llvm.x86.avx512.psrav.q.256(<4 x i64> %v, <4 x i64> %a) + ret <4 x i64> %1 +} + +define <8 x i64> @avx512_psrav_q_512_vec(<8 x i64> %v, <8 x i64> %a) { +; CHECK-LABEL: @avx512_psrav_q_512_vec( +; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <8 x i64> %a, +; CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i64> %a, <8 x i64> +; CHECK-NEXT: [[TMP3:%.*]] = ashr <8 x i64> %v, [[TMP2]] +; CHECK-NEXT: ret <8 x i64> [[TMP3]] +; + %1 = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> %v, <8 x i64> %a) + ret <8 x i64> %1 +} + +define <2 x i64> @avx512_psrav_w_128_vec(<2 x i64> %v, <2 x i64> %a) { +; CHECK-LABEL: @avx512_psrav_w_128_vec( +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> %v to <8 x i16> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> %a to <8 x i16> +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult <8 x i16> [[TMP2]], +; CHECK-NEXT: [[TMP4:%.*]] = select <8 x i1> [[TMP3]], <8 x i16> [[TMP2]], <8 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = ashr <8 x i16> [[TMP1]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <2 x i64> +; CHECK-NEXT: ret <2 x i64> [[TMP6]] +; + %1 = bitcast <2 x i64> %v to <8 x i16> + %2 = bitcast <2 x i64> %a to <8 x i16> + %3 = call <8 x i16> @llvm.x86.avx512.psrav.w.128(<8 x i16> %1, <8 x i16> %2) + %4 = bitcast <8 x i16> %3 to <2 x i64> + ret <2 x i64> %4 +} + +define <4 x i64> @avx512_psrav_w_256_vec(<4 x i64> %v, <4 x i64> %a) { +; CHECK-LABEL: @avx512_psrav_w_256_vec( +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i64> %v to <16 x i16> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i64> %a to <16 x i16> +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult <16 x i16> [[TMP2]], +; CHECK-NEXT: [[TMP4:%.*]] = select <16 x i1> [[TMP3]], <16 x i16> [[TMP2]], <16 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = ashr <16 x i16> [[TMP1]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i16> [[TMP5]] to <4 x i64> +; CHECK-NEXT: ret <4 x i64> [[TMP6]] +; + %1 = bitcast <4 x i64> %v to <16 x i16> + %2 = bitcast <4 x i64> %a to <16 x i16> + %3 = call <16 x i16> @llvm.x86.avx512.psrav.w.256(<16 x i16> %1, <16 x i16> %2) + %4 = bitcast <16 x i16> %3 to <4 x i64> + ret <4 x i64> %4 +} + +define <8 x i64> @avx512_psrav_w_512_vec(<8 x i64> %v, <8 x i64> %a) { +; CHECK-LABEL: @avx512_psrav_w_512_vec( +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i64> %v to <32 x i16> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i64> %a to <32 x i16> +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult <32 x i16> [[TMP2]], +; CHECK-NEXT: [[TMP4:%.*]] = select <32 x i1> [[TMP3]], <32 x i16> [[TMP2]], <32 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = ashr <32 x i16> [[TMP1]],
[[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <32 x i16> [[TMP5]] to <8 x i64> +; CHECK-NEXT: ret <8 x i64> [[TMP6]] +; + %1 = bitcast <8 x i64> %v to <32 x i16> + %2 = bitcast <8 x i64> %a to <32 x i16> + %3 = call <32 x i16> @llvm.x86.avx512.psrav.w.512(<32 x i16> %1, <32 x i16> %2) + %4 = bitcast <32 x i16> %3 to <8 x i64> + ret <8 x i64> %4 +} + +; ; Vector Demanded Bits ; define <8 x i16> @sse2_psra_w_var(<8 x i16> %v, <8 x i16> %a) { ; CHECK-LABEL: @sse2_psra_w_var( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %v, <8 x i16> %a) -; CHECK-NEXT: ret <8 x i16> [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[A:%.*]] to <2 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], 15 +; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], i64 [[TMP2]], i64 15 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[TMP4]] to i16 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[TMP5]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = ashr <8 x i16> [[V:%.*]], [[DOTSPLAT]] +; CHECK-NEXT: ret <8 x i16> [[TMP6]] ; %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> %2 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %v, <8 x i16> %1) @@ -2690,9 +2830,14 @@ define <8 x i16> @sse2_psra_w_var_bc(<8 x i16> %v, <2 x i64> %a) { ; CHECK-LABEL: @sse2_psra_w_var_bc( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> %a to <8 x i16> -; CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %v, <8 x i16> [[TMP1]]) -; CHECK-NEXT: ret <8 x i16> [[TMP2]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[A:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i64 [[TMP1]], 15 +; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i64 [[TMP1]], i64 15 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i16 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[TMP4]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = ashr <8 x i16> [[V:%.*]], [[DOTSPLAT]] +; CHECK-NEXT: ret <8 x i16> [[TMP5]] ; %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> %2 = bitcast <2 x i64> %1 to <8 x i16> @@ -2702,8 +2847,15 @@ define <4 x i32> @sse2_psra_d_var(<4 x i32> %v, <4 x i32> %a) { ; CHECK-LABEL: @sse2_psra_d_var( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %v, <4 x i32> %a) -; CHECK-NEXT: ret <4 x i32> [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], 31 +; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], i64 [[TMP2]], i64 31 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[TMP4]] to i32 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[TMP5]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = ashr <4 x i32> [[V:%.*]], [[DOTSPLAT]] +; CHECK-NEXT: ret <4 x i32> [[TMP6]] ; %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> %2 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %v, <4 x i32> %1) @@ -2712,9 +2864,15 @@ define <4 x i32> @sse2_psra_d_var_bc(<4 x i32> %v, <8 x i16> %a) { ; CHECK-LABEL: 
@sse2_psra_d_var_bc( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> %a to <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %v, <4 x i32> [[TMP1]]) -; CHECK-NEXT: ret <4 x i32> [[TMP2]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[A:%.*]] to <2 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], 31 +; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], i64 [[TMP2]], i64 31 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[TMP4]] to i32 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[TMP5]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = ashr <4 x i32> [[V:%.*]], [[DOTSPLAT]] +; CHECK-NEXT: ret <4 x i32> [[TMP6]] ; %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> %2 = bitcast <8 x i16> %1 to <4 x i32> @@ -2724,8 +2882,15 @@ define <16 x i16> @avx2_psra_w_var(<16 x i16> %v, <8 x i16> %a) { ; CHECK-LABEL: @avx2_psra_w_var( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %v, <8 x i16> %a) -; CHECK-NEXT: ret <16 x i16> [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[A:%.*]] to <2 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], 15 +; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], i64 [[TMP2]], i64 15 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[TMP4]] to i16 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i16> undef, i16 [[TMP5]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i16> [[DOTSPLATINSERT]], <16 x i16> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = ashr <16 x i16> [[V:%.*]], [[DOTSPLAT]] +; CHECK-NEXT: ret <16 x i16> [[TMP6]] ; %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> %2 = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %v, <8 x i16> %1) @@ -2734,8 +2899,15 @@ define <8 x i32> @avx2_psra_d_var(<8 x i32> %v, <4 x i32> %a) { ; CHECK-LABEL: @avx2_psra_d_var( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %v, <4 x i32> %a) -; CHECK-NEXT: ret <8 x i32> [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], 31 +; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], i64 [[TMP2]], i64 31 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[TMP4]] to i32 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[TMP5]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = ashr <8 x i32> [[V:%.*]], [[DOTSPLAT]] +; CHECK-NEXT: ret <8 x i32> [[TMP6]] ; %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> %2 = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %v, <4 x i32> %1) @@ -2744,8 +2916,13 @@ define <2 x i64> @avx512_psra_q_128_var(<2 x i64> %v, <2 x i64> %a) { ; CHECK-LABEL: @avx512_psra_q_128_var( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.x86.avx512.psra.q.128(<2 x i64> %v, <2 x i64> %a) -; CHECK-NEXT: ret <2 x i64> [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[A:%.*]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i64 [[TMP1]], 63 +; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i64 [[TMP1]], i64 63 +; CHECK-NEXT: 
[[DOTSPLATINSERT:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i64> [[DOTSPLATINSERT]], <2 x i64> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = ashr <2 x i64> [[V:%.*]], [[DOTSPLAT]] +; CHECK-NEXT: ret <2 x i64> [[TMP4]] ; %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> %2 = tail call <2 x i64> @llvm.x86.avx512.psra.q.128(<2 x i64> %v, <2 x i64> %1) @@ -2754,8 +2931,13 @@ define <4 x i64> @avx512_psra_q_256_var(<4 x i64> %v, <2 x i64> %a) { ; CHECK-LABEL: @avx512_psra_q_256_var( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.x86.avx512.psra.q.256(<4 x i64> %v, <2 x i64> %a) -; CHECK-NEXT: ret <4 x i64> [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[A:%.*]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i64 [[TMP1]], 63 +; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i64 [[TMP1]], i64 63 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[TMP3]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = ashr <4 x i64> [[V:%.*]], [[DOTSPLAT]] +; CHECK-NEXT: ret <4 x i64> [[TMP4]] ; %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> %2 = tail call <4 x i64> @llvm.x86.avx512.psra.q.256(<4 x i64> %v, <2 x i64> %1) @@ -2764,8 +2946,15 @@ define <32 x i16> @avx512_psra_w_512_var(<32 x i16> %v, <8 x i16> %a) { ; CHECK-LABEL: @avx512_psra_w_512_var( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i16> @llvm.x86.avx512.psra.w.512(<32 x i16> %v, <8 x i16> %a) -; CHECK-NEXT: ret <32 x i16> [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[A:%.*]] to <2 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], 15 +; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], i64 [[TMP2]], i64 15 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[TMP4]] to i16 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <32 x i16> undef, i16 [[TMP5]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <32 x i16> [[DOTSPLATINSERT]], <32 x i16> undef, <32 x i32> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = ashr <32 x i16> [[V:%.*]], [[DOTSPLAT]] +; CHECK-NEXT: ret <32 x i16> [[TMP6]] ; %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> %2 = tail call <32 x i16> @llvm.x86.avx512.psra.w.512(<32 x i16> %v, <8 x i16> %1) @@ -2774,8 +2963,15 @@ define <16 x i32> @avx512_psra_d_512_var(<16 x i32> %v, <4 x i32> %a) { ; CHECK-LABEL: @avx512_psra_d_512_var( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> %v, <4 x i32> %a) -; CHECK-NEXT: ret <16 x i32> [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], 31 +; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], i64 [[TMP2]], i64 31 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[TMP4]] to i32 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i32> undef, i32 [[TMP5]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i32> [[DOTSPLATINSERT]], <16 x i32> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = ashr <16 x i32> [[V:%.*]], [[DOTSPLAT]] +; CHECK-NEXT: ret <16 x i32> [[TMP6]] ; %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> %2 = tail call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> %v, <4 x i32> %1) @@ -2784,8 
+2980,13 @@ define <8 x i64> @avx512_psra_q_512_var(<8 x i64> %v, <2 x i64> %a) { ; CHECK-LABEL: @avx512_psra_q_512_var( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> %v, <2 x i64> %a) -; CHECK-NEXT: ret <8 x i64> [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[A:%.*]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i64 [[TMP1]], 63 +; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i64 [[TMP1]], i64 63 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i64> undef, i64 [[TMP3]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i64> [[DOTSPLATINSERT]], <8 x i64> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = ashr <8 x i64> [[V:%.*]], [[DOTSPLAT]] +; CHECK-NEXT: ret <8 x i64> [[TMP4]] ; %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> %2 = tail call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> %v, <2 x i64> %1) @@ -2794,8 +2995,16 @@ define <8 x i16> @sse2_psrl_w_var(<8 x i16> %v, <8 x i16> %a) { ; CHECK-LABEL: @sse2_psrl_w_var( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %v, <8 x i16> %a) -; CHECK-NEXT: ret <8 x i16> [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[A:%.*]] to <2 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], 16 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP2]] to i16 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[TMP4]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], <8 x i16> [[V:%.*]], <8 x i16> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP3]], <8 x i16> [[DOTSPLAT]], <8 x i16> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = lshr <8 x i16> [[TMP5]], [[TMP6]] +; CHECK-NEXT: ret <8 x i16> [[TMP7]] ; %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> %2 = tail call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %v, <8 x i16> %1) @@ -2804,8 +3013,16 @@ define <4 x i32> @sse2_psrl_d_var(<4 x i32> %v, <4 x i32> %a) { ; CHECK-LABEL: @sse2_psrl_d_var( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %v, <4 x i32> %a) -; CHECK-NEXT: ret <4 x i32> [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], 32 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP2]] to i32 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], <4 x i32> [[V:%.*]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP3]], <4 x i32> [[DOTSPLAT]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = lshr <4 x i32> [[TMP5]], [[TMP6]] +; CHECK-NEXT: ret <4 x i32> [[TMP7]] ; %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> %2 = tail call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %v, <4 x i32> %1) @@ -2814,8 +3031,13 @@ define <2 x i64> @sse2_psrl_q_var(<2 x i64> %v, <2 x i64> %a) { ; CHECK-LABEL: @sse2_psrl_q_var( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %v, <2 x i64> %a) -; CHECK-NEXT: ret <2 x i64> [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x 
i64> [[A:%.*]], <2 x i64> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[A]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], 64 +; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], <2 x i64> [[V:%.*]], <2 x i64> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], <2 x i64> [[TMP1]], <2 x i64> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = lshr <2 x i64> [[TMP4]], [[TMP5]] +; CHECK-NEXT: ret <2 x i64> [[TMP6]] ; %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> %2 = tail call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %v, <2 x i64> %1) @@ -2824,8 +3046,16 @@ define <16 x i16> @avx2_psrl_w_var(<16 x i16> %v, <8 x i16> %a) { ; CHECK-LABEL: @avx2_psrl_w_var( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %v, <8 x i16> %a) -; CHECK-NEXT: ret <16 x i16> [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[A:%.*]] to <2 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], 16 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP2]] to i16 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i16> undef, i16 [[TMP4]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i16> [[DOTSPLATINSERT]], <16 x i16> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], <16 x i16> [[V:%.*]], <16 x i16> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP3]], <16 x i16> [[DOTSPLAT]], <16 x i16> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = lshr <16 x i16> [[TMP5]], [[TMP6]] +; CHECK-NEXT: ret <16 x i16> [[TMP7]] ; %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> %2 = tail call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %v, <8 x i16> %1) @@ -2834,9 +3064,16 @@ define <16 x i16> @avx2_psrl_w_var_bc(<16 x i16> %v, <16 x i8> %a) { ; CHECK-LABEL: @avx2_psrl_w_var_bc( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> %a to <8 x i16> -; CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %v, <8 x i16> [[TMP1]]) -; CHECK-NEXT: ret <16 x i16> [[TMP2]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[A:%.*]] to <2 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], 16 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP2]] to i16 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i16> undef, i16 [[TMP4]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i16> [[DOTSPLATINSERT]], <16 x i16> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], <16 x i16> [[V:%.*]], <16 x i16> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP3]], <16 x i16> [[DOTSPLAT]], <16 x i16> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = lshr <16 x i16> [[TMP5]], [[TMP6]] +; CHECK-NEXT: ret <16 x i16> [[TMP7]] ; %1 = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> %2 = bitcast <16 x i8> %1 to <8 x i16> @@ -2846,8 +3083,16 @@ define <8 x i32> @avx2_psrl_d_var(<8 x i32> %v, <4 x i32> %a) { ; CHECK-LABEL: @avx2_psrl_d_var( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %v, <4 x i32> %a) -; CHECK-NEXT: ret <8 x i32> [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], 32 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP2]] to i32 +; CHECK-NEXT: 
[[DOTSPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], <8 x i32> [[V:%.*]], <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP3]], <8 x i32> [[DOTSPLAT]], <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = lshr <8 x i32> [[TMP5]], [[TMP6]] +; CHECK-NEXT: ret <8 x i32> [[TMP7]] ; %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> %2 = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %v, <4 x i32> %1) @@ -2856,9 +3101,15 @@ define <8 x i32> @avx2_psrl_d_var_bc(<8 x i32> %v, <2 x i64> %a) { ; CHECK-LABEL: @avx2_psrl_d_var_bc( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> %a to <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %v, <4 x i32> [[TMP1]]) -; CHECK-NEXT: ret <8 x i32> [[TMP2]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[A:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i64 [[TMP1]], 32 +; CHECK-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP1]] to i32 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[TMP3]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], <8 x i32> [[V:%.*]], <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP2]], <8 x i32> [[DOTSPLAT]], <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = lshr <8 x i32> [[TMP4]], [[TMP5]] +; CHECK-NEXT: ret <8 x i32> [[TMP6]] ; %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> %2 = bitcast <2 x i64> %1 to <4 x i32> @@ -2868,8 +3119,13 @@ define <4 x i64> @avx2_psrl_q_var(<4 x i64> %v, <2 x i64> %a) { ; CHECK-LABEL: @avx2_psrl_q_var( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %v, <2 x i64> %a) -; CHECK-NEXT: ret <4 x i64> [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[A:%.*]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i64 [[TMP1]], 64 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], <4 x i64> [[V:%.*]], <4 x i64> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], <4 x i64> [[DOTSPLAT]], <4 x i64> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = lshr <4 x i64> [[TMP3]], [[TMP4]] +; CHECK-NEXT: ret <4 x i64> [[TMP5]] ; %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> %2 = tail call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %v, <2 x i64> %1) @@ -2878,8 +3134,16 @@ define <32 x i16> @avx512_psrl_w_512_var(<32 x i16> %v, <8 x i16> %a) { ; CHECK-LABEL: @avx512_psrl_w_512_var( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i16> @llvm.x86.avx512.psrl.w.512(<32 x i16> %v, <8 x i16> %a) -; CHECK-NEXT: ret <32 x i16> [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[A:%.*]] to <2 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], 16 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP2]] to i16 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <32 x i16> undef, i16 [[TMP4]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <32 x i16> [[DOTSPLATINSERT]], <32 x i16> undef, <32 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], <32 x i16> [[V:%.*]], <32 x i16> zeroinitializer +; 
CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP3]], <32 x i16> [[DOTSPLAT]], <32 x i16> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = lshr <32 x i16> [[TMP5]], [[TMP6]] +; CHECK-NEXT: ret <32 x i16> [[TMP7]] ; %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> %2 = tail call <32 x i16> @llvm.x86.avx512.psrl.w.512(<32 x i16> %v, <8 x i16> %1) @@ -2888,9 +3152,16 @@ define <32 x i16> @avx512_psrl_w_512_var_bc(<32 x i16> %v, <16 x i8> %a) { ; CHECK-LABEL: @avx512_psrl_w_512_var_bc( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> %a to <8 x i16> -; CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i16> @llvm.x86.avx512.psrl.w.512(<32 x i16> %v, <8 x i16> [[TMP1]]) -; CHECK-NEXT: ret <32 x i16> [[TMP2]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[A:%.*]] to <2 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], 16 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP2]] to i16 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <32 x i16> undef, i16 [[TMP4]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <32 x i16> [[DOTSPLATINSERT]], <32 x i16> undef, <32 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], <32 x i16> [[V:%.*]], <32 x i16> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP3]], <32 x i16> [[DOTSPLAT]], <32 x i16> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = lshr <32 x i16> [[TMP5]], [[TMP6]] +; CHECK-NEXT: ret <32 x i16> [[TMP7]] ; %1 = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> %2 = bitcast <16 x i8> %1 to <8 x i16> @@ -2900,8 +3171,16 @@ define <16 x i32> @avx512_psrl_d_512_var(<16 x i32> %v, <4 x i32> %a) { ; CHECK-LABEL: @avx512_psrl_d_512_var( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> %v, <4 x i32> %a) -; CHECK-NEXT: ret <16 x i32> [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], 32 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP2]] to i32 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i32> undef, i32 [[TMP4]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i32> [[DOTSPLATINSERT]], <16 x i32> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], <16 x i32> [[V:%.*]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP3]], <16 x i32> [[DOTSPLAT]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = lshr <16 x i32> [[TMP5]], [[TMP6]] +; CHECK-NEXT: ret <16 x i32> [[TMP7]] ; %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> %2 = tail call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> %v, <4 x i32> %1) @@ -2910,9 +3189,15 @@ define <16 x i32> @avx512_psrl_d_512_var_bc(<16 x i32> %v, <2 x i64> %a) { ; CHECK-LABEL: @avx512_psrl_d_512_var_bc( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> %a to <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> %v, <4 x i32> [[TMP1]]) -; CHECK-NEXT: ret <16 x i32> [[TMP2]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[A:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i64 [[TMP1]], 32 +; CHECK-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP1]] to i32 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i32> undef, i32 [[TMP3]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i32> [[DOTSPLATINSERT]], <16 x i32> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: 
[[TMP4:%.*]] = select i1 [[TMP2]], <16 x i32> [[V:%.*]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP2]], <16 x i32> [[DOTSPLAT]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = lshr <16 x i32> [[TMP4]], [[TMP5]] +; CHECK-NEXT: ret <16 x i32> [[TMP6]] ; %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> %2 = bitcast <2 x i64> %1 to <4 x i32> @@ -2922,8 +3207,13 @@ define <8 x i64> @avx512_psrl_q_512_var(<8 x i64> %v, <2 x i64> %a) { ; CHECK-LABEL: @avx512_psrl_q_512_var( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> %v, <2 x i64> %a) -; CHECK-NEXT: ret <8 x i64> [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[A:%.*]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i64 [[TMP1]], 64 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], <8 x i64> [[V:%.*]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], <8 x i64> [[DOTSPLAT]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = lshr <8 x i64> [[TMP3]], [[TMP4]] +; CHECK-NEXT: ret <8 x i64> [[TMP5]] ; %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> %2 = tail call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> %v, <2 x i64> %1) @@ -2932,8 +3222,16 @@ define <8 x i16> @sse2_psll_w_var(<8 x i16> %v, <8 x i16> %a) { ; CHECK-LABEL: @sse2_psll_w_var( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %v, <8 x i16> %a) -; CHECK-NEXT: ret <8 x i16> [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[A:%.*]] to <2 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], 16 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP2]] to i16 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[TMP4]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], <8 x i16> [[V:%.*]], <8 x i16> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP3]], <8 x i16> [[DOTSPLAT]], <8 x i16> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = shl <8 x i16> [[TMP5]], [[TMP6]] +; CHECK-NEXT: ret <8 x i16> [[TMP7]] ; %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> %2 = tail call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %v, <8 x i16> %1) @@ -2942,8 +3240,16 @@ define <4 x i32> @sse2_psll_d_var(<4 x i32> %v, <4 x i32> %a) { ; CHECK-LABEL: @sse2_psll_d_var( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %v, <4 x i32> %a) -; CHECK-NEXT: ret <4 x i32> [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], 32 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP2]] to i32 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], <4 x i32> [[V:%.*]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP3]], <4 x i32> [[DOTSPLAT]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = shl <4 x i32> [[TMP5]], [[TMP6]] +; CHECK-NEXT: ret <4 x i32> [[TMP7]] ; %1 = shufflevector <4 x 
i32> %a, <4 x i32> undef, <4 x i32> %2 = tail call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %v, <4 x i32> %1) @@ -2952,8 +3258,13 @@ define <2 x i64> @sse2_psll_q_var(<2 x i64> %v, <2 x i64> %a) { ; CHECK-LABEL: @sse2_psll_q_var( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %v, <2 x i64> %a) -; CHECK-NEXT: ret <2 x i64> [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i64> [[A:%.*]], <2 x i64> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[A]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], 64 +; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], <2 x i64> [[V:%.*]], <2 x i64> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], <2 x i64> [[TMP1]], <2 x i64> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = shl <2 x i64> [[TMP4]], [[TMP5]] +; CHECK-NEXT: ret <2 x i64> [[TMP6]] ; %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> %2 = tail call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %v, <2 x i64> %1) @@ -2962,8 +3273,16 @@ define <16 x i16> @avx2_psll_w_var(<16 x i16> %v, <8 x i16> %a) { ; CHECK-LABEL: @avx2_psll_w_var( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %v, <8 x i16> %a) -; CHECK-NEXT: ret <16 x i16> [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[A:%.*]] to <2 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], 16 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP2]] to i16 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i16> undef, i16 [[TMP4]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i16> [[DOTSPLATINSERT]], <16 x i16> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], <16 x i16> [[V:%.*]], <16 x i16> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP3]], <16 x i16> [[DOTSPLAT]], <16 x i16> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = shl <16 x i16> [[TMP5]], [[TMP6]] +; CHECK-NEXT: ret <16 x i16> [[TMP7]] ; %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> %2 = tail call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %v, <8 x i16> %1) @@ -2972,8 +3291,16 @@ define <8 x i32> @avx2_psll_d_var(<8 x i32> %v, <4 x i32> %a) { ; CHECK-LABEL: @avx2_psll_d_var( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %v, <4 x i32> %a) -; CHECK-NEXT: ret <8 x i32> [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], 32 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP2]] to i32 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], <8 x i32> [[V:%.*]], <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP3]], <8 x i32> [[DOTSPLAT]], <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = shl <8 x i32> [[TMP5]], [[TMP6]] +; CHECK-NEXT: ret <8 x i32> [[TMP7]] ; %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> %2 = tail call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %v, <4 x i32> %1) @@ -2982,8 +3309,13 @@ define <4 x i64> @avx2_psll_q_var(<4 x i64> %v, <2 x i64> %a) { ; CHECK-LABEL: @avx2_psll_q_var( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> 
@llvm.x86.avx2.psll.q(<4 x i64> %v, <2 x i64> %a) -; CHECK-NEXT: ret <4 x i64> [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[A:%.*]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i64 [[TMP1]], 64 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], <4 x i64> [[V:%.*]], <4 x i64> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], <4 x i64> [[DOTSPLAT]], <4 x i64> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = shl <4 x i64> [[TMP3]], [[TMP4]] +; CHECK-NEXT: ret <4 x i64> [[TMP5]] ; %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> %2 = tail call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %v, <2 x i64> %1) @@ -2992,8 +3324,16 @@ define <32 x i16> @avx512_psll_w_512_var(<32 x i16> %v, <8 x i16> %a) { ; CHECK-LABEL: @avx512_psll_w_512_var( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i16> @llvm.x86.avx512.psll.w.512(<32 x i16> %v, <8 x i16> %a) -; CHECK-NEXT: ret <32 x i16> [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[A:%.*]] to <2 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], 16 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP2]] to i16 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <32 x i16> undef, i16 [[TMP4]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <32 x i16> [[DOTSPLATINSERT]], <32 x i16> undef, <32 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], <32 x i16> [[V:%.*]], <32 x i16> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP3]], <32 x i16> [[DOTSPLAT]], <32 x i16> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = shl <32 x i16> [[TMP5]], [[TMP6]] +; CHECK-NEXT: ret <32 x i16> [[TMP7]] ; %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> %2 = tail call <32 x i16> @llvm.x86.avx512.psll.w.512(<32 x i16> %v, <8 x i16> %1) @@ -3002,8 +3342,16 @@ define <16 x i32> @avx512_psll_d_512_var(<16 x i32> %v, <4 x i32> %a) { ; CHECK-LABEL: @avx512_psll_d_512_var( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> %v, <4 x i32> %a) -; CHECK-NEXT: ret <16 x i32> [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], 32 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP2]] to i32 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i32> undef, i32 [[TMP4]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i32> [[DOTSPLATINSERT]], <16 x i32> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], <16 x i32> [[V:%.*]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP3]], <16 x i32> [[DOTSPLAT]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = shl <16 x i32> [[TMP5]], [[TMP6]] +; CHECK-NEXT: ret <16 x i32> [[TMP7]] ; %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> %2 = tail call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> %v, <4 x i32> %1) @@ -3012,8 +3360,13 @@ define <8 x i64> @avx512_psll_q_512_var(<8 x i64> %v, <2 x i64> %a) { ; CHECK-LABEL: @avx512_psll_q_512_var( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> %v, <2 x i64> %a) -; CHECK-NEXT: ret <8 x i64> [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[A:%.*]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i64 
[[TMP1]], 64 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], <8 x i64> [[V:%.*]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], <8 x i64> [[DOTSPLAT]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = shl <8 x i64> [[TMP3]], [[TMP4]] +; CHECK-NEXT: ret <8 x i64> [[TMP5]] ; %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> %2 = tail call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> %v, <2 x i64> %1)