Index: lib/Transforms/InstCombine/InstCombineCalls.cpp =================================================================== --- lib/Transforms/InstCombine/InstCombineCalls.cpp +++ lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -199,34 +199,65 @@ static Value *SimplifyX86immshift(const IntrinsicInst &II, InstCombiner::BuilderTy &Builder, - bool ShiftLeft) { - // Simplify if count is constant. To 0 if >= BitWidth, - // otherwise to shl/lshr. - auto CDV = dyn_cast(II.getArgOperand(1)); - auto CInt = dyn_cast(II.getArgOperand(1)); - if (!CDV && !CInt) + bool LogicalShift, bool ShiftLeft) { + // Simplify if count is constant. + auto Arg1 = II.getArgOperand(1); + auto CAZ = dyn_cast(Arg1); + auto CDV = dyn_cast(Arg1); + auto CInt = dyn_cast(Arg1); + if (!CAZ && !CDV && !CInt) return nullptr; - ConstantInt *Count; - if (CDV) - Count = cast(CDV->getElementAsConstant(0)); - else - Count = CInt; + + APInt Count(64, 0); + if (CDV) { + // Shift amount vectors use the entire lower 64-bits integer. + auto VT = cast(CDV->getType()); + unsigned BitWidth = VT->getElementType()->getPrimitiveSizeInBits(); + assert((64 % BitWidth) == 0 && "Unexpected packed shift size"); + unsigned NumSubElts = 64 / BitWidth; + + // Concatenate the sub-elements to create the 64-bit value. + for (unsigned i = 0; i != NumSubElts; ++i) { + unsigned SubEltIdx = (NumSubElts - 1) - i; + auto SubElt = cast(CDV->getElementAsConstant(SubEltIdx)); + Count = Count.shl(BitWidth); + Count |= SubElt->getValue().zextOrTrunc(64); + } + } + else if (CInt) + Count = CInt->getValue(); auto Vec = II.getArgOperand(0); auto VT = cast(Vec->getType()); auto SVT = VT->getElementType(); - if (Count->getZExtValue() > (SVT->getPrimitiveSizeInBits() - 1)) - return ConstantAggregateZero::get(VT); - unsigned VWidth = VT->getNumElements(); + unsigned BitWidth = SVT->getPrimitiveSizeInBits(); + + // If shift-by-zero then just return the original value. + if (Count == 0) + return Vec; + + // Handle cases when Shift >= BitWidth. + if (Count.uge(BitWidth)) { + // If LogicalShift - just return zero. + if (LogicalShift) + return ConstantAggregateZero::get(VT); + + // If ArithmeticShift - clamp Shift to (BitWidth - 1). + Count = APInt(64, BitWidth - 1); + } // Get a constant vector of the same type as the first operand. - auto VTCI = ConstantInt::get(VT->getElementType(), Count->getZExtValue()); + auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth)); + auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt); if (ShiftLeft) - return Builder.CreateShl(Vec, Builder.CreateVectorSplat(VWidth, VTCI)); + return Builder.CreateShl(Vec, ShiftVec); + + if (LogicalShift) + return Builder.CreateLShr(Vec, ShiftVec); - return Builder.CreateLShr(Vec, Builder.CreateVectorSplat(VWidth, VTCI)); + return Builder.CreateAShr(Vec, ShiftVec); } static Value *SimplifyX86extend(const IntrinsicInst &II, @@ -753,6 +784,19 @@ break; } + // Constant fold ashr( , Ci ). + case Intrinsic::x86_sse2_psra_d: + case Intrinsic::x86_sse2_psra_w: + case Intrinsic::x86_sse2_psrai_d: + case Intrinsic::x86_sse2_psrai_w: + case Intrinsic::x86_avx2_psra_d: + case Intrinsic::x86_avx2_psra_w: + case Intrinsic::x86_avx2_psrai_d: + case Intrinsic::x86_avx2_psrai_w: + if (Value *V = SimplifyX86immshift(*II, *Builder, false, false)) + return ReplaceInstUsesWith(*II, V); + break; + // Constant fold lshr( , Ci ). case Intrinsic::x86_sse2_psrl_d: case Intrinsic::x86_sse2_psrl_q: @@ -766,7 +810,7 @@ case Intrinsic::x86_avx2_psrli_d: case Intrinsic::x86_avx2_psrli_q: case Intrinsic::x86_avx2_psrli_w: - if (Value *V = SimplifyX86immshift(*II, *Builder, false)) + if (Value *V = SimplifyX86immshift(*II, *Builder, true, false)) return ReplaceInstUsesWith(*II, V); break; @@ -783,7 +827,7 @@ case Intrinsic::x86_avx2_pslli_d: case Intrinsic::x86_avx2_pslli_q: case Intrinsic::x86_avx2_pslli_w: - if (Value *V = SimplifyX86immshift(*II, *Builder, true)) + if (Value *V = SimplifyX86immshift(*II, *Builder, true, true)) return ReplaceInstUsesWith(*II, V); break; Index: test/Transforms/InstCombine/x86-vector-shifts.ll =================================================================== --- test/Transforms/InstCombine/x86-vector-shifts.ll +++ test/Transforms/InstCombine/x86-vector-shifts.ll @@ -2,6 +2,102 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" ; +; ASHR - Immediate +; + +define <8 x i16> @sse2_psrai_w_0(<8 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psrai_w_0 +; CHECK: ret <8 x i16> %v + %1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %v, i32 0) + ret <8 x i16> %1 +} + +define <8 x i16> @sse2_psrai_w_15(<8 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psrai_w_15 +; CHECK: %1 = ashr <8 x i16> %v, +; CHECK: ret <8 x i16> %1 + %1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %v, i32 15) + ret <8 x i16> %1 +} + +define <8 x i16> @sse2_psrai_w_64(<8 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psrai_w_64 +; CHECK: %1 = ashr <8 x i16> %v, +; CHECK: ret <8 x i16> %1 + %1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %v, i32 64) + ret <8 x i16> %1 +} + +define <4 x i32> @sse2_psrai_d_0(<4 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psrai_d_0 +; CHECK: ret <4 x i32> %v + %1 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %v, i32 0) + ret <4 x i32> %1 +} + +define <4 x i32> @sse2_psrai_d_15(<4 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psrai_d_15 +; CHECK: %1 = ashr <4 x i32> %v, +; CHECK: ret <4 x i32> %1 + %1 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %v, i32 15) + ret <4 x i32> %1 +} + +define <4 x i32> @sse2_psrai_d_64(<4 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psrai_d_64 +; CHECK: %1 = ashr <4 x i32> %v, +; CHECK: ret <4 x i32> %1 + %1 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %v, i32 64) + ret <4 x i32> %1 +} + +define <16 x i16> @avx2_psrai_w_0(<16 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psrai_w_0 +; CHECK: ret <16 x i16> %v + %1 = tail call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %v, i32 0) + ret <16 x i16> %1 +} + +define <16 x i16> @avx2_psrai_w_15(<16 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psrai_w_15 +; CHECK: %1 = ashr <16 x i16> %v, +; CHECK: ret <16 x i16> %1 + %1 = tail call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %v, i32 15) + ret <16 x i16> %1 +} + +define <16 x i16> @avx2_psrai_w_64(<16 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psrai_w_64 +; CHECK: %1 = ashr <16 x i16> %v, +; CHECK: ret <16 x i16> %1 + %1 = tail call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %v, i32 64) + ret <16 x i16> %1 +} + +define <8 x i32> @avx2_psrai_d_0(<8 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psrai_d_0 +; CHECK: ret <8 x i32> %v + %1 = tail call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %v, i32 0) + ret <8 x i32> %1 +} + +define <8 x i32> @avx2_psrai_d_15(<8 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psrai_d_15 +; CHECK: %1 = ashr <8 x i32> %v, +; CHECK: ret <8 x i32> %1 + %1 = tail call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %v, i32 15) + ret <8 x i32> %1 +} + +define <8 x i32> @avx2_psrai_d_64(<8 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psrai_d_64 +; CHECK: %1 = ashr <8 x i32> %v, +; CHECK: ret <8 x i32> %1 + %1 = tail call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %v, i32 64) + ret <8 x i32> %1 +} + +; ; LSHR - Immediate ; @@ -274,9 +370,144 @@ } ; +; ASHR - Constant Vector +; + +define <8 x i16> @sse2_psra_w_0(<8 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psra_w_0 +; CHECK: ret <8 x i16> %v + %1 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %v, <8 x i16> zeroinitializer) + ret <8 x i16> %1 +} + +define <8 x i16> @sse2_psra_w_15(<8 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psra_w_15 +; CHECK: %1 = ashr <8 x i16> %v, +; CHECK: ret <8 x i16> %1 + %1 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %v, <8 x i16> ) + ret <8 x i16> %1 +} + +define <8 x i16> @sse2_psra_w_15_splat(<8 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psra_w_15_splat +; CHECK: %1 = ashr <8 x i16> %v, +; CHECK: ret <8 x i16> %1 + %1 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %v, <8 x i16> ) + ret <8 x i16> %1 +} + +define <8 x i16> @sse2_psra_w_64(<8 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psra_w_64 +; CHECK: %1 = ashr <8 x i16> %v, +; CHECK: ret <8 x i16> %1 + %1 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %v, <8 x i16> ) + ret <8 x i16> %1 +} + +define <4 x i32> @sse2_psra_d_0(<4 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psra_d_0 +; CHECK: ret <4 x i32> %v + %1 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %v, <4 x i32> zeroinitializer) + ret <4 x i32> %1 +} + +define <4 x i32> @sse2_psra_d_15(<4 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psra_d_15 +; CHECK: %1 = ashr <4 x i32> %v, +; CHECK: ret <4 x i32> %1 + %1 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %v, <4 x i32> ) + ret <4 x i32> %1 +} + +define <4 x i32> @sse2_psra_d_15_splat(<4 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psra_d_15_splat +; CHECK: %1 = ashr <4 x i32> %v, +; CHECK: ret <4 x i32> %1 + %1 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %v, <4 x i32> ) + ret <4 x i32> %1 +} + +define <4 x i32> @sse2_psra_d_64(<4 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psra_d_64 +; CHECK: %1 = ashr <4 x i32> %v, +; CHECK: ret <4 x i32> %1 + %1 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %v, <4 x i32> ) + ret <4 x i32> %1 +} + +define <16 x i16> @avx2_psra_w_0(<16 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psra_w_0 +; CHECK: ret <16 x i16> %v + %1 = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %v, <8 x i16> zeroinitializer) + ret <16 x i16> %1 +} + +define <16 x i16> @avx2_psra_w_15(<16 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psra_w_15 +; CHECK: %1 = ashr <16 x i16> %v, +; CHECK: ret <16 x i16> %1 + %1 = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %v, <8 x i16> ) + ret <16 x i16> %1 +} + +define <16 x i16> @avx2_psra_w_15_splat(<16 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psra_w_15_splat +; CHECK: %1 = ashr <16 x i16> %v, +; CHECK: ret <16 x i16> %1 + %1 = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %v, <8 x i16> ) + ret <16 x i16> %1 +} + +define <16 x i16> @avx2_psra_w_64(<16 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psra_w_64 +; CHECK: %1 = ashr <16 x i16> %v, +; CHECK: ret <16 x i16> %1 + %1 = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %v, <8 x i16> ) + ret <16 x i16> %1 +} + +define <8 x i32> @avx2_psra_d_0(<8 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psra_d_0 +; CHECK: ret <8 x i32> %v + %1 = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %v, <4 x i32> zeroinitializer) + ret <8 x i32> %1 +} + +define <8 x i32> @avx2_psra_d_15(<8 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psra_d_15 +; CHECK: %1 = ashr <8 x i32> %v, +; CHECK: ret <8 x i32> %1 + %1 = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %v, <4 x i32> ) + ret <8 x i32> %1 +} + +define <8 x i32> @avx2_psra_d_15_splat(<8 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psra_d_15_splat +; CHECK: %1 = ashr <8 x i32> %v, +; CHECK: ret <8 x i32> %1 + %1 = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %v, <4 x i32> ) + ret <8 x i32> %1 +} + +define <8 x i32> @avx2_psra_d_64(<8 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psra_d_64 +; CHECK: %1 = ashr <8 x i32> %v, +; CHECK: ret <8 x i32> %1 + %1 = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %v, <4 x i32> ) + ret <8 x i32> %1 +} + +; ; LSHR - Constant Vector ; +define <8 x i16> @sse2_psrl_w_0(<8 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psrl_w_0 +; CHECK: ret <8 x i16> %v + %1 = tail call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %v, <8 x i16> zeroinitializer) + ret <8 x i16> %1 +} + define <8 x i16> @sse2_psrl_w_15(<8 x i16> %v) nounwind readnone uwtable { ; CHECK-LABEL: @sse2_psrl_w_15 ; CHECK: %1 = lshr <8 x i16> %v, @@ -285,6 +516,13 @@ ret <8 x i16> %1 } +define <8 x i16> @sse2_psrl_w_15_splat(<8 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psrl_w_15_splat +; CHECK: ret <8 x i16> zeroinitializer + %1 = tail call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %v, <8 x i16> ) + ret <8 x i16> %1 +} + define <8 x i16> @sse2_psrl_w_64(<8 x i16> %v) nounwind readnone uwtable { ; CHECK-LABEL: @sse2_psrl_w_64 ; CHECK: ret <8 x i16> zeroinitializer @@ -292,6 +530,13 @@ ret <8 x i16> %1 } +define <4 x i32> @sse2_psrl_d_0(<4 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psrl_d_0 +; CHECK: ret <4 x i32> %v + %1 = tail call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %v, <4 x i32> zeroinitializer) + ret <4 x i32> %1 +} + define <4 x i32> @sse2_psrl_d_15(<4 x i32> %v) nounwind readnone uwtable { ; CHECK-LABEL: @sse2_psrl_d_15 ; CHECK: %1 = lshr <4 x i32> %v, @@ -300,6 +545,13 @@ ret <4 x i32> %1 } +define <4 x i32> @sse2_psrl_d_15_splat(<4 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psrl_d_15_splat +; CHECK: ret <4 x i32> zeroinitializer + %1 = tail call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %v, <4 x i32> ) + ret <4 x i32> %1 +} + define <4 x i32> @sse2_psrl_d_64(<4 x i32> %v) nounwind readnone uwtable { ; CHECK-LABEL: @sse2_psrl_d_64 ; CHECK: ret <4 x i32> zeroinitializer @@ -307,6 +559,13 @@ ret <4 x i32> %1 } +define <2 x i64> @sse2_psrl_q_0(<2 x i64> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psrl_q_0 +; CHECK: ret <2 x i64> %v + %1 = tail call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %v, <2 x i64> zeroinitializer) + ret <2 x i64> %1 +} + define <2 x i64> @sse2_psrl_q_15(<2 x i64> %v) nounwind readnone uwtable { ; CHECK-LABEL: @sse2_psrl_q_15 ; CHECK: %1 = lshr <2 x i64> %v, @@ -322,6 +581,13 @@ ret <2 x i64> %1 } +define <16 x i16> @avx2_psrl_w_0(<16 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psrl_w_0 +; CHECK: ret <16 x i16> %v + %1 = tail call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %v, <8 x i16> zeroinitializer) + ret <16 x i16> %1 +} + define <16 x i16> @avx2_psrl_w_15(<16 x i16> %v) nounwind readnone uwtable { ; CHECK-LABEL: @avx2_psrl_w_15 ; CHECK: %1 = lshr <16 x i16> %v, @@ -330,6 +596,13 @@ ret <16 x i16> %1 } +define <16 x i16> @avx2_psrl_w_15_splat(<16 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psrl_w_15_splat +; CHECK: ret <16 x i16> zeroinitializer + %1 = tail call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %v, <8 x i16> ) + ret <16 x i16> %1 +} + define <16 x i16> @avx2_psrl_w_64(<16 x i16> %v) nounwind readnone uwtable { ; CHECK-LABEL: @avx2_psrl_w_64 ; CHECK: ret <16 x i16> zeroinitializer @@ -337,6 +610,13 @@ ret <16 x i16> %1 } +define <8 x i32> @avx2_psrl_d_0(<8 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psrl_d_0 +; CHECK: ret <8 x i32> %v + %1 = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %v, <4 x i32> zeroinitializer) + ret <8 x i32> %1 +} + define <8 x i32> @avx2_psrl_d_15(<8 x i32> %v) nounwind readnone uwtable { ; CHECK-LABEL: @avx2_psrl_d_15 ; CHECK: %1 = lshr <8 x i32> %v, @@ -345,6 +625,13 @@ ret <8 x i32> %1 } +define <8 x i32> @avx2_psrl_d_15_splat(<8 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psrl_d_15_splat +; CHECK: ret <8 x i32> zeroinitializer + %1 = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %v, <4 x i32> ) + ret <8 x i32> %1 +} + define <8 x i32> @avx2_psrl_d_64(<8 x i32> %v) nounwind readnone uwtable { ; CHECK-LABEL: @avx2_psrl_d_64 ; CHECK: ret <8 x i32> zeroinitializer @@ -352,6 +639,13 @@ ret <8 x i32> %1 } +define <4 x i64> @avx2_psrl_q_0(<4 x i64> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psrl_q_0 +; CHECK: ret <4 x i64> %v + %1 = tail call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %v, <2 x i64> zeroinitializer) + ret <4 x i64> %1 +} + define <4 x i64> @avx2_psrl_q_15(<4 x i64> %v) nounwind readnone uwtable { ; CHECK-LABEL: @avx2_psrl_q_15 ; CHECK: %1 = lshr <4 x i64> %v, @@ -371,6 +665,13 @@ ; SHL - Constant Vector ; +define <8 x i16> @sse2_psll_w_0(<8 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psll_w_0 +; CHECK: ret <8 x i16> %v + %1 = tail call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %v, <8 x i16> zeroinitializer) + ret <8 x i16> %1 +} + define <8 x i16> @sse2_psll_w_15(<8 x i16> %v) nounwind readnone uwtable { ; CHECK-LABEL: @sse2_psll_w_15 ; CHECK: %1 = shl <8 x i16> %v, @@ -379,6 +680,13 @@ ret <8 x i16> %1 } +define <8 x i16> @sse2_psll_w_15_splat(<8 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psll_w_15_splat +; CHECK: ret <8 x i16> zeroinitializer + %1 = tail call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %v, <8 x i16> ) + ret <8 x i16> %1 +} + define <8 x i16> @sse2_psll_w_64(<8 x i16> %v) nounwind readnone uwtable { ; CHECK-LABEL: @sse2_psll_w_64 ; CHECK: ret <8 x i16> zeroinitializer @@ -386,6 +694,13 @@ ret <8 x i16> %1 } +define <4 x i32> @sse2_psll_d_0(<4 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psll_d_0 +; CHECK: ret <4 x i32> %v + %1 = tail call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %v, <4 x i32> zeroinitializer) + ret <4 x i32> %1 +} + define <4 x i32> @sse2_psll_d_15(<4 x i32> %v) nounwind readnone uwtable { ; CHECK-LABEL: @sse2_psll_d_15 ; CHECK: %1 = shl <4 x i32> %v, @@ -394,6 +709,13 @@ ret <4 x i32> %1 } +define <4 x i32> @sse2_psll_d_15_splat(<4 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psll_d_15_splat +; CHECK: ret <4 x i32> zeroinitializer + %1 = tail call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %v, <4 x i32> ) + ret <4 x i32> %1 +} + define <4 x i32> @sse2_psll_d_64(<4 x i32> %v) nounwind readnone uwtable { ; CHECK-LABEL: @sse2_psll_d_64 ; CHECK: ret <4 x i32> zeroinitializer @@ -401,6 +723,13 @@ ret <4 x i32> %1 } +define <2 x i64> @sse2_psll_q_0(<2 x i64> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psll_q_0 +; CHECK: ret <2 x i64> %v + %1 = tail call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %v, <2 x i64> zeroinitializer) + ret <2 x i64> %1 +} + define <2 x i64> @sse2_psll_q_15(<2 x i64> %v) nounwind readnone uwtable { ; CHECK-LABEL: @sse2_psll_q_15 ; CHECK: %1 = shl <2 x i64> %v, @@ -416,6 +745,13 @@ ret <2 x i64> %1 } +define <16 x i16> @avx2_psll_w_0(<16 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psll_w_0 +; CHECK: ret <16 x i16> %v + %1 = tail call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %v, <8 x i16> zeroinitializer) + ret <16 x i16> %1 +} + define <16 x i16> @avx2_psll_w_15(<16 x i16> %v) nounwind readnone uwtable { ; CHECK-LABEL: @avx2_psll_w_15 ; CHECK: %1 = shl <16 x i16> %v, @@ -424,6 +760,13 @@ ret <16 x i16> %1 } +define <16 x i16> @avx2_psll_w_15_splat(<16 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psll_w_15_splat +; CHECK: ret <16 x i16> zeroinitializer + %1 = tail call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %v, <8 x i16> ) + ret <16 x i16> %1 +} + define <16 x i16> @avx2_psll_w_64(<16 x i16> %v) nounwind readnone uwtable { ; CHECK-LABEL: @avx2_psll_w_64 ; CHECK: ret <16 x i16> zeroinitializer @@ -431,6 +774,13 @@ ret <16 x i16> %1 } +define <8 x i32> @avx2_psll_d_0(<8 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psll_d_0 +; CHECK: ret <8 x i32> %v + %1 = tail call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %v, <4 x i32> zeroinitializer) + ret <8 x i32> %1 +} + define <8 x i32> @avx2_psll_d_15(<8 x i32> %v) nounwind readnone uwtable { ; CHECK-LABEL: @avx2_psll_d_15 ; CHECK: %1 = shl <8 x i32> %v, @@ -439,6 +789,13 @@ ret <8 x i32> %1 } +define <8 x i32> @avx2_psll_d_15_splat(<8 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psll_d_15_splat +; CHECK: ret <8 x i32> zeroinitializer + %1 = tail call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %v, <4 x i32> ) + ret <8 x i32> %1 +} + define <8 x i32> @avx2_psll_d_64(<8 x i32> %v) nounwind readnone uwtable { ; CHECK-LABEL: @avx2_psll_d_64 ; CHECK: ret <8 x i32> zeroinitializer @@ -446,6 +803,13 @@ ret <8 x i32> %1 } +define <4 x i64> @avx2_psll_q_0(<4 x i64> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psll_q_0 +; CHECK: ret <4 x i64> %v + %1 = tail call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %v, <2 x i64> zeroinitializer) + ret <4 x i64> %1 +} + define <4 x i64> @avx2_psll_q_15(<4 x i64> %v) nounwind readnone uwtable { ; CHECK-LABEL: @avx2_psll_q_15 ; CHECK: %1 = shl <4 x i64> %v, @@ -660,6 +1024,7 @@ declare <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64>, <2 x i64>) #1 declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) #1 declare <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16>, <8 x i16>) #1 + declare <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64>, i32) #1 declare <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32>, i32) #1 declare <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16>, i32) #1 @@ -673,4 +1038,13 @@ declare <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>) #1 declare <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16>, <8 x i16>) #1 +declare <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32>, i32) #1 +declare <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16>, i32) #1 +declare <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32>, <4 x i32>) #1 +declare <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16>, <8 x i16>) #1 +declare <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32>, i32) #1 +declare <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16>, i32) #1 +declare <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32>, <4 x i32>) #1 +declare <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16>, <8 x i16>) #1 + attributes #1 = { nounwind readnone }