Index: llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp =================================================================== --- llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -200,33 +200,56 @@ static Value *SimplifyX86immshift(const IntrinsicInst &II, InstCombiner::BuilderTy &Builder, bool ShiftLeft) { - // Simplify if count is constant. To 0 if >= BitWidth, - // otherwise to shl/lshr. - auto CDV = dyn_cast(II.getArgOperand(1)); - auto CInt = dyn_cast(II.getArgOperand(1)); - if (!CDV && !CInt) + // Simplify if count is constant. + auto Arg1 = II.getArgOperand(1); + auto CAZ = dyn_cast(Arg1); + auto CDV = dyn_cast(Arg1); + auto CInt = dyn_cast(Arg1); + if (!CAZ && !CDV && !CInt) return nullptr; - ConstantInt *Count; - if (CDV) - Count = cast(CDV->getElementAsConstant(0)); - else - Count = CInt; + + APInt Count(64, 0); + if (CDV) { + // SSE2/AVX2 uses all the first 64-bits of the 128-bit vector + // operand to compute the shift amount. + auto VT = cast(CDV->getType()); + unsigned BitWidth = VT->getElementType()->getPrimitiveSizeInBits(); + assert((64 % BitWidth) == 0 && "Unexpected packed shift size"); + unsigned NumSubElts = 64 / BitWidth; + + // Concatenate the sub-elements to create the 64-bit value. + for (unsigned i = 0; i != NumSubElts; ++i) { + unsigned SubEltIdx = (NumSubElts - 1) - i; + auto SubElt = cast(CDV->getElementAsConstant(SubEltIdx)); + Count = Count.shl(BitWidth); + Count |= SubElt->getValue().zextOrTrunc(64); + } + } + else if (CInt) + Count = CInt->getValue(); auto Vec = II.getArgOperand(0); auto VT = cast(Vec->getType()); auto SVT = VT->getElementType(); - if (Count->getZExtValue() > (SVT->getPrimitiveSizeInBits() - 1)) - return ConstantAggregateZero::get(VT); - unsigned VWidth = VT->getNumElements(); + unsigned BitWidth = SVT->getPrimitiveSizeInBits(); + + // If shift-by-zero then just return the original value. + if (Count == 0) + return Vec; + + // Handle cases when Shift >= BitWidth - just return zero. + if (Count.uge(BitWidth)) + return ConstantAggregateZero::get(VT); // Get a constant vector of the same type as the first operand. - auto VTCI = ConstantInt::get(VT->getElementType(), Count->getZExtValue()); + auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth)); + auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt); if (ShiftLeft) - return Builder.CreateShl(Vec, Builder.CreateVectorSplat(VWidth, VTCI)); + return Builder.CreateShl(Vec, ShiftVec); - return Builder.CreateLShr(Vec, Builder.CreateVectorSplat(VWidth, VTCI)); + return Builder.CreateLShr(Vec, ShiftVec); } static Value *SimplifyX86extend(const IntrinsicInst &II, Index: llvm/trunk/test/Transforms/InstCombine/x86-vector-shifts.ll =================================================================== --- llvm/trunk/test/Transforms/InstCombine/x86-vector-shifts.ll +++ llvm/trunk/test/Transforms/InstCombine/x86-vector-shifts.ll @@ -7,132 +7,132 @@ define <8 x i16> @sse2_psrli_w_0(<8 x i16> %v) nounwind readnone uwtable { ; CHECK-LABEL: @sse2_psrli_w_0 -; CHECK: ret <8 x i16> %v +; CHECK-NEXT: ret <8 x i16> %v %1 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %v, i32 0) ret <8 x i16> %1 } define <8 x i16> @sse2_psrli_w_15(<8 x i16> %v) nounwind readnone uwtable { ; CHECK-LABEL: @sse2_psrli_w_15 -; CHECK: %1 = lshr <8 x i16> %v, -; CHECK: ret <8 x i16> %1 +; CHECK-NEXT: %1 = lshr <8 x i16> %v, +; CHECK-NEXT: ret <8 x i16> %1 %1 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %v, i32 15) ret <8 x i16> %1 } define <8 x i16> @sse2_psrli_w_64(<8 x i16> %v) nounwind readnone uwtable { ; CHECK-LABEL: @sse2_psrli_w_64 -; CHECK: ret <8 x i16> zeroinitializer +; CHECK-NEXT: ret <8 x i16> zeroinitializer %1 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %v, i32 64) ret <8 x i16> %1 } define <4 x i32> @sse2_psrli_d_0(<4 x i32> %v) nounwind readnone uwtable { ; CHECK-LABEL: @sse2_psrli_d_0 -; CHECK: ret <4 x i32> %v +; CHECK-NEXT: ret <4 x i32> %v %1 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %v, i32 0) ret <4 x i32> %1 } define <4 x i32> @sse2_psrli_d_15(<4 x i32> %v) nounwind readnone uwtable { ; CHECK-LABEL: @sse2_psrli_d_15 -; CHECK: %1 = lshr <4 x i32> %v, -; CHECK: ret <4 x i32> %1 +; CHECK-NEXT: %1 = lshr <4 x i32> %v, +; CHECK-NEXT: ret <4 x i32> %1 %1 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %v, i32 15) ret <4 x i32> %1 } define <4 x i32> @sse2_psrli_d_64(<4 x i32> %v) nounwind readnone uwtable { ; CHECK-LABEL: @sse2_psrli_d_64 -; CHECK: ret <4 x i32> zeroinitializer +; CHECK-NEXT: ret <4 x i32> zeroinitializer %1 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %v, i32 64) ret <4 x i32> %1 } define <2 x i64> @sse2_psrli_q_0(<2 x i64> %v) nounwind readnone uwtable { ; CHECK-LABEL: @sse2_psrli_q_0 -; CHECK: ret <2 x i64> %v +; CHECK-NEXT: ret <2 x i64> %v %1 = tail call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %v, i32 0) ret <2 x i64> %1 } define <2 x i64> @sse2_psrli_q_15(<2 x i64> %v) nounwind readnone uwtable { ; CHECK-LABEL: @sse2_psrli_q_15 -; CHECK: %1 = lshr <2 x i64> %v, -; CHECK: ret <2 x i64> %1 +; CHECK-NEXT: %1 = lshr <2 x i64> %v, +; CHECK-NEXT: ret <2 x i64> %1 %1 = tail call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %v, i32 15) ret <2 x i64> %1 } define <2 x i64> @sse2_psrli_q_64(<2 x i64> %v) nounwind readnone uwtable { ; CHECK-LABEL: @sse2_psrli_q_64 -; CHECK: ret <2 x i64> zeroinitializer +; CHECK-NEXT: ret <2 x i64> zeroinitializer %1 = tail call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %v, i32 64) ret <2 x i64> %1 } define <16 x i16> @avx2_psrli_w_0(<16 x i16> %v) nounwind readnone uwtable { ; CHECK-LABEL: @avx2_psrli_w_0 -; CHECK: ret <16 x i16> %v +; CHECK-NEXT: ret <16 x i16> %v %1 = tail call <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16> %v, i32 0) ret <16 x i16> %1 } define <16 x i16> @avx2_psrli_w_15(<16 x i16> %v) nounwind readnone uwtable { ; CHECK-LABEL: @avx2_psrli_w_15 -; CHECK: %1 = lshr <16 x i16> %v, -; CHECK: ret <16 x i16> %1 +; CHECK-NEXT: %1 = lshr <16 x i16> %v, +; CHECK-NEXT: ret <16 x i16> %1 %1 = tail call <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16> %v, i32 15) ret <16 x i16> %1 } define <16 x i16> @avx2_psrli_w_64(<16 x i16> %v) nounwind readnone uwtable { ; CHECK-LABEL: @avx2_psrli_w_64 -; CHECK: ret <16 x i16> zeroinitializer +; CHECK-NEXT: ret <16 x i16> zeroinitializer %1 = tail call <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16> %v, i32 64) ret <16 x i16> %1 } define <8 x i32> @avx2_psrli_d_0(<8 x i32> %v) nounwind readnone uwtable { ; CHECK-LABEL: @avx2_psrli_d_0 -; CHECK: ret <8 x i32> %v +; CHECK-NEXT: ret <8 x i32> %v %1 = tail call <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32> %v, i32 0) ret <8 x i32> %1 } define <8 x i32> @avx2_psrli_d_15(<8 x i32> %v) nounwind readnone uwtable { ; CHECK-LABEL: @avx2_psrli_d_15 -; CHECK: %1 = lshr <8 x i32> %v, -; CHECK: ret <8 x i32> %1 +; CHECK-NEXT: %1 = lshr <8 x i32> %v, +; CHECK-NEXT: ret <8 x i32> %1 %1 = tail call <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32> %v, i32 15) ret <8 x i32> %1 } define <8 x i32> @avx2_psrli_d_64(<8 x i32> %v) nounwind readnone uwtable { ; CHECK-LABEL: @avx2_psrli_d_64 -; CHECK: ret <8 x i32> zeroinitializer +; CHECK-NEXT: ret <8 x i32> zeroinitializer %1 = tail call <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32> %v, i32 64) ret <8 x i32> %1 } define <4 x i64> @avx2_psrli_q_0(<4 x i64> %v) nounwind readnone uwtable { ; CHECK-LABEL: @avx2_psrli_q_0 -; CHECK: ret <4 x i64> %v +; CHECK-NEXT: ret <4 x i64> %v %1 = tail call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> %v, i32 0) ret <4 x i64> %1 } define <4 x i64> @avx2_psrli_q_15(<4 x i64> %v) nounwind readnone uwtable { ; CHECK-LABEL: @avx2_psrli_q_15 -; CHECK: %1 = lshr <4 x i64> %v, -; CHECK: ret <4 x i64> %1 +; CHECK-NEXT: %1 = lshr <4 x i64> %v, +; CHECK-NEXT: ret <4 x i64> %1 %1 = tail call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> %v, i32 15) ret <4 x i64> %1 } define <4 x i64> @avx2_psrli_q_64(<4 x i64> %v) nounwind readnone uwtable { ; CHECK-LABEL: @avx2_psrli_q_64 -; CHECK: ret <4 x i64> zeroinitializer +; CHECK-NEXT: ret <4 x i64> zeroinitializer %1 = tail call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> %v, i32 64) ret <4 x i64> %1 } @@ -143,132 +143,132 @@ define <8 x i16> @sse2_pslli_w_0(<8 x i16> %v) nounwind readnone uwtable { ; CHECK-LABEL: @sse2_pslli_w_0 -; CHECK: ret <8 x i16> %v +; CHECK-NEXT: ret <8 x i16> %v %1 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %v, i32 0) ret <8 x i16> %1 } define <8 x i16> @sse2_pslli_w_15(<8 x i16> %v) nounwind readnone uwtable { ; CHECK-LABEL: @sse2_pslli_w_15 -; CHECK: %1 = shl <8 x i16> %v, -; CHECK: ret <8 x i16> %1 +; CHECK-NEXT: %1 = shl <8 x i16> %v, +; CHECK-NEXT: ret <8 x i16> %1 %1 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %v, i32 15) ret <8 x i16> %1 } define <8 x i16> @sse2_pslli_w_64(<8 x i16> %v) nounwind readnone uwtable { ; CHECK-LABEL: @sse2_pslli_w_64 -; CHECK: ret <8 x i16> zeroinitializer +; CHECK-NEXT: ret <8 x i16> zeroinitializer %1 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %v, i32 64) ret <8 x i16> %1 } define <4 x i32> @sse2_pslli_d_0(<4 x i32> %v) nounwind readnone uwtable { ; CHECK-LABEL: @sse2_pslli_d_0 -; CHECK: ret <4 x i32> %v +; CHECK-NEXT: ret <4 x i32> %v %1 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %v, i32 0) ret <4 x i32> %1 } define <4 x i32> @sse2_pslli_d_15(<4 x i32> %v) nounwind readnone uwtable { ; CHECK-LABEL: @sse2_pslli_d_15 -; CHECK: %1 = shl <4 x i32> %v, -; CHECK: ret <4 x i32> %1 +; CHECK-NEXT: %1 = shl <4 x i32> %v, +; CHECK-NEXT: ret <4 x i32> %1 %1 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %v, i32 15) ret <4 x i32> %1 } define <4 x i32> @sse2_pslli_d_64(<4 x i32> %v) nounwind readnone uwtable { ; CHECK-LABEL: @sse2_pslli_d_64 -; CHECK: ret <4 x i32> zeroinitializer +; CHECK-NEXT: ret <4 x i32> zeroinitializer %1 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %v, i32 64) ret <4 x i32> %1 } define <2 x i64> @sse2_pslli_q_0(<2 x i64> %v) nounwind readnone uwtable { ; CHECK-LABEL: @sse2_pslli_q_0 -; CHECK: ret <2 x i64> %v +; CHECK-NEXT: ret <2 x i64> %v %1 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %v, i32 0) ret <2 x i64> %1 } define <2 x i64> @sse2_pslli_q_15(<2 x i64> %v) nounwind readnone uwtable { ; CHECK-LABEL: @sse2_pslli_q_15 -; CHECK: %1 = shl <2 x i64> %v, -; CHECK: ret <2 x i64> %1 +; CHECK-NEXT: %1 = shl <2 x i64> %v, +; CHECK-NEXT: ret <2 x i64> %1 %1 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %v, i32 15) ret <2 x i64> %1 } define <2 x i64> @sse2_pslli_q_64(<2 x i64> %v) nounwind readnone uwtable { ; CHECK-LABEL: @sse2_pslli_q_64 -; CHECK: ret <2 x i64> zeroinitializer +; CHECK-NEXT: ret <2 x i64> zeroinitializer %1 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %v, i32 64) ret <2 x i64> %1 } define <16 x i16> @avx2_pslli_w_0(<16 x i16> %v) nounwind readnone uwtable { ; CHECK-LABEL: @avx2_pslli_w_0 -; CHECK: ret <16 x i16> %v +; CHECK-NEXT: ret <16 x i16> %v %1 = tail call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %v, i32 0) ret <16 x i16> %1 } define <16 x i16> @avx2_pslli_w_15(<16 x i16> %v) nounwind readnone uwtable { ; CHECK-LABEL: @avx2_pslli_w_15 -; CHECK: %1 = shl <16 x i16> %v, -; CHECK: ret <16 x i16> %1 +; CHECK-NEXT: %1 = shl <16 x i16> %v, +; CHECK-NEXT: ret <16 x i16> %1 %1 = tail call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %v, i32 15) ret <16 x i16> %1 } define <16 x i16> @avx2_pslli_w_64(<16 x i16> %v) nounwind readnone uwtable { ; CHECK-LABEL: @avx2_pslli_w_64 -; CHECK: ret <16 x i16> zeroinitializer +; CHECK-NEXT: ret <16 x i16> zeroinitializer %1 = tail call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %v, i32 64) ret <16 x i16> %1 } define <8 x i32> @avx2_pslli_d_0(<8 x i32> %v) nounwind readnone uwtable { ; CHECK-LABEL: @avx2_pslli_d_0 -; CHECK: ret <8 x i32> %v +; CHECK-NEXT: ret <8 x i32> %v %1 = tail call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %v, i32 0) ret <8 x i32> %1 } define <8 x i32> @avx2_pslli_d_15(<8 x i32> %v) nounwind readnone uwtable { ; CHECK-LABEL: @avx2_pslli_d_15 -; CHECK: %1 = shl <8 x i32> %v, -; CHECK: ret <8 x i32> %1 +; CHECK-NEXT: %1 = shl <8 x i32> %v, +; CHECK-NEXT: ret <8 x i32> %1 %1 = tail call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %v, i32 15) ret <8 x i32> %1 } define <8 x i32> @avx2_pslli_d_64(<8 x i32> %v) nounwind readnone uwtable { ; CHECK-LABEL: @avx2_pslli_d_64 -; CHECK: ret <8 x i32> zeroinitializer +; CHECK-NEXT: ret <8 x i32> zeroinitializer %1 = tail call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %v, i32 64) ret <8 x i32> %1 } define <4 x i64> @avx2_pslli_q_0(<4 x i64> %v) nounwind readnone uwtable { ; CHECK-LABEL: @avx2_pslli_q_0 -; CHECK: ret <4 x i64> %v +; CHECK-NEXT: ret <4 x i64> %v %1 = tail call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %v, i32 0) ret <4 x i64> %1 } define <4 x i64> @avx2_pslli_q_15(<4 x i64> %v) nounwind readnone uwtable { ; CHECK-LABEL: @avx2_pslli_q_15 -; CHECK: %1 = shl <4 x i64> %v, -; CHECK: ret <4 x i64> %1 +; CHECK-NEXT: %1 = shl <4 x i64> %v, +; CHECK-NEXT: ret <4 x i64> %1 %1 = tail call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %v, i32 15) ret <4 x i64> %1 } define <4 x i64> @avx2_pslli_q_64(<4 x i64> %v) nounwind readnone uwtable { ; CHECK-LABEL: @avx2_pslli_q_64 -; CHECK: ret <4 x i64> zeroinitializer +; CHECK-NEXT: ret <4 x i64> zeroinitializer %1 = tail call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %v, i32 64) ret <4 x i64> %1 } @@ -277,92 +277,162 @@ ; LSHR - Constant Vector ; +define <8 x i16> @sse2_psrl_w_0(<8 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psrl_w_0 +; CHECK-NEXT: ret <8 x i16> %v + %1 = tail call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %v, <8 x i16> zeroinitializer) + ret <8 x i16> %1 +} + define <8 x i16> @sse2_psrl_w_15(<8 x i16> %v) nounwind readnone uwtable { ; CHECK-LABEL: @sse2_psrl_w_15 -; CHECK: %1 = lshr <8 x i16> %v, -; CHECK: ret <8 x i16> %1 +; CHECK-NEXT: %1 = lshr <8 x i16> %v, +; CHECK-NEXT: ret <8 x i16> %1 %1 = tail call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %v, <8 x i16> ) ret <8 x i16> %1 } +define <8 x i16> @sse2_psrl_w_15_splat(<8 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psrl_w_15_splat +; CHECK-NEXT: ret <8 x i16> zeroinitializer + %1 = tail call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %v, <8 x i16> ) + ret <8 x i16> %1 +} + define <8 x i16> @sse2_psrl_w_64(<8 x i16> %v) nounwind readnone uwtable { ; CHECK-LABEL: @sse2_psrl_w_64 -; CHECK: ret <8 x i16> zeroinitializer +; CHECK-NEXT: ret <8 x i16> zeroinitializer %1 = tail call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %v, <8 x i16> ) ret <8 x i16> %1 } +define <4 x i32> @sse2_psrl_d_0(<4 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psrl_d_0 +; CHECK-NEXT: ret <4 x i32> %v + %1 = tail call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %v, <4 x i32> zeroinitializer) + ret <4 x i32> %1 +} + define <4 x i32> @sse2_psrl_d_15(<4 x i32> %v) nounwind readnone uwtable { ; CHECK-LABEL: @sse2_psrl_d_15 -; CHECK: %1 = lshr <4 x i32> %v, -; CHECK: ret <4 x i32> %1 +; CHECK-NEXT: %1 = lshr <4 x i32> %v, +; CHECK-NEXT: ret <4 x i32> %1 %1 = tail call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %v, <4 x i32> ) ret <4 x i32> %1 } +define <4 x i32> @sse2_psrl_d_15_splat(<4 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psrl_d_15_splat +; CHECK-NEXT: ret <4 x i32> zeroinitializer + %1 = tail call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %v, <4 x i32> ) + ret <4 x i32> %1 +} + define <4 x i32> @sse2_psrl_d_64(<4 x i32> %v) nounwind readnone uwtable { ; CHECK-LABEL: @sse2_psrl_d_64 -; CHECK: ret <4 x i32> zeroinitializer +; CHECK-NEXT: ret <4 x i32> zeroinitializer %1 = tail call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %v, <4 x i32> ) ret <4 x i32> %1 } +define <2 x i64> @sse2_psrl_q_0(<2 x i64> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psrl_q_0 +; CHECK-NEXT: ret <2 x i64> %v + %1 = tail call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %v, <2 x i64> zeroinitializer) + ret <2 x i64> %1 +} + define <2 x i64> @sse2_psrl_q_15(<2 x i64> %v) nounwind readnone uwtable { ; CHECK-LABEL: @sse2_psrl_q_15 -; CHECK: %1 = lshr <2 x i64> %v, -; CHECK: ret <2 x i64> %1 +; CHECK-NEXT: %1 = lshr <2 x i64> %v, +; CHECK-NEXT: ret <2 x i64> %1 %1 = tail call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %v, <2 x i64> ) ret <2 x i64> %1 } define <2 x i64> @sse2_psrl_q_64(<2 x i64> %v) nounwind readnone uwtable { ; CHECK-LABEL: @sse2_psrl_q_64 -; CHECK: ret <2 x i64> zeroinitializer +; CHECK-NEXT: ret <2 x i64> zeroinitializer %1 = tail call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %v, <2 x i64> ) ret <2 x i64> %1 } +define <16 x i16> @avx2_psrl_w_0(<16 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psrl_w_0 +; CHECK-NEXT: ret <16 x i16> %v + %1 = tail call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %v, <8 x i16> zeroinitializer) + ret <16 x i16> %1 +} + define <16 x i16> @avx2_psrl_w_15(<16 x i16> %v) nounwind readnone uwtable { ; CHECK-LABEL: @avx2_psrl_w_15 -; CHECK: %1 = lshr <16 x i16> %v, -; CHECK: ret <16 x i16> %1 +; CHECK-NEXT: %1 = lshr <16 x i16> %v, +; CHECK-NEXT: ret <16 x i16> %1 %1 = tail call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %v, <8 x i16> ) ret <16 x i16> %1 } +define <16 x i16> @avx2_psrl_w_15_splat(<16 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psrl_w_15_splat +; CHECK-NEXT: ret <16 x i16> zeroinitializer + %1 = tail call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %v, <8 x i16> ) + ret <16 x i16> %1 +} + define <16 x i16> @avx2_psrl_w_64(<16 x i16> %v) nounwind readnone uwtable { ; CHECK-LABEL: @avx2_psrl_w_64 -; CHECK: ret <16 x i16> zeroinitializer +; CHECK-NEXT: ret <16 x i16> zeroinitializer %1 = tail call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %v, <8 x i16> ) ret <16 x i16> %1 } +define <8 x i32> @avx2_psrl_d_0(<8 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psrl_d_0 +; CHECK-NEXT: ret <8 x i32> %v + %1 = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %v, <4 x i32> zeroinitializer) + ret <8 x i32> %1 +} + define <8 x i32> @avx2_psrl_d_15(<8 x i32> %v) nounwind readnone uwtable { ; CHECK-LABEL: @avx2_psrl_d_15 -; CHECK: %1 = lshr <8 x i32> %v, -; CHECK: ret <8 x i32> %1 +; CHECK-NEXT: %1 = lshr <8 x i32> %v, +; CHECK-NEXT: ret <8 x i32> %1 %1 = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %v, <4 x i32> ) ret <8 x i32> %1 } +define <8 x i32> @avx2_psrl_d_15_splat(<8 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psrl_d_15_splat +; CHECK-NEXT: ret <8 x i32> zeroinitializer + %1 = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %v, <4 x i32> ) + ret <8 x i32> %1 +} + define <8 x i32> @avx2_psrl_d_64(<8 x i32> %v) nounwind readnone uwtable { ; CHECK-LABEL: @avx2_psrl_d_64 -; CHECK: ret <8 x i32> zeroinitializer +; CHECK-NEXT: ret <8 x i32> zeroinitializer %1 = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %v, <4 x i32> ) ret <8 x i32> %1 } +define <4 x i64> @avx2_psrl_q_0(<4 x i64> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psrl_q_0 +; CHECK-NEXT: ret <4 x i64> %v + %1 = tail call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %v, <2 x i64> zeroinitializer) + ret <4 x i64> %1 +} + define <4 x i64> @avx2_psrl_q_15(<4 x i64> %v) nounwind readnone uwtable { ; CHECK-LABEL: @avx2_psrl_q_15 -; CHECK: %1 = lshr <4 x i64> %v, -; CHECK: ret <4 x i64> %1 +; CHECK-NEXT: %1 = lshr <4 x i64> %v, +; CHECK-NEXT: ret <4 x i64> %1 %1 = tail call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %v, <2 x i64> ) ret <4 x i64> %1 } define <4 x i64> @avx2_psrl_q_64(<4 x i64> %v) nounwind readnone uwtable { ; CHECK-LABEL: @avx2_psrl_q_64 -; CHECK: ret <4 x i64> zeroinitializer +; CHECK-NEXT: ret <4 x i64> zeroinitializer %1 = tail call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %v, <2 x i64> ) ret <4 x i64> %1 } @@ -371,92 +441,162 @@ ; SHL - Constant Vector ; +define <8 x i16> @sse2_psll_w_0(<8 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psll_w_0 +; CHECK-NEXT: ret <8 x i16> %v + %1 = tail call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %v, <8 x i16> zeroinitializer) + ret <8 x i16> %1 +} + define <8 x i16> @sse2_psll_w_15(<8 x i16> %v) nounwind readnone uwtable { ; CHECK-LABEL: @sse2_psll_w_15 -; CHECK: %1 = shl <8 x i16> %v, -; CHECK: ret <8 x i16> %1 +; CHECK-NEXT: %1 = shl <8 x i16> %v, +; CHECK-NEXT: ret <8 x i16> %1 %1 = tail call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %v, <8 x i16> ) ret <8 x i16> %1 } +define <8 x i16> @sse2_psll_w_15_splat(<8 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psll_w_15_splat +; CHECK-NEXT: ret <8 x i16> zeroinitializer + %1 = tail call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %v, <8 x i16> ) + ret <8 x i16> %1 +} + define <8 x i16> @sse2_psll_w_64(<8 x i16> %v) nounwind readnone uwtable { ; CHECK-LABEL: @sse2_psll_w_64 -; CHECK: ret <8 x i16> zeroinitializer +; CHECK-NEXT: ret <8 x i16> zeroinitializer %1 = tail call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %v, <8 x i16> ) ret <8 x i16> %1 } +define <4 x i32> @sse2_psll_d_0(<4 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psll_d_0 +; CHECK-NEXT: ret <4 x i32> %v + %1 = tail call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %v, <4 x i32> zeroinitializer) + ret <4 x i32> %1 +} + define <4 x i32> @sse2_psll_d_15(<4 x i32> %v) nounwind readnone uwtable { ; CHECK-LABEL: @sse2_psll_d_15 -; CHECK: %1 = shl <4 x i32> %v, -; CHECK: ret <4 x i32> %1 +; CHECK-NEXT: %1 = shl <4 x i32> %v, +; CHECK-NEXT: ret <4 x i32> %1 %1 = tail call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %v, <4 x i32> ) ret <4 x i32> %1 } +define <4 x i32> @sse2_psll_d_15_splat(<4 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psll_d_15_splat +; CHECK-NEXT: ret <4 x i32> zeroinitializer + %1 = tail call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %v, <4 x i32> ) + ret <4 x i32> %1 +} + define <4 x i32> @sse2_psll_d_64(<4 x i32> %v) nounwind readnone uwtable { ; CHECK-LABEL: @sse2_psll_d_64 -; CHECK: ret <4 x i32> zeroinitializer +; CHECK-NEXT: ret <4 x i32> zeroinitializer %1 = tail call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %v, <4 x i32> ) ret <4 x i32> %1 } +define <2 x i64> @sse2_psll_q_0(<2 x i64> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psll_q_0 +; CHECK-NEXT: ret <2 x i64> %v + %1 = tail call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %v, <2 x i64> zeroinitializer) + ret <2 x i64> %1 +} + define <2 x i64> @sse2_psll_q_15(<2 x i64> %v) nounwind readnone uwtable { ; CHECK-LABEL: @sse2_psll_q_15 -; CHECK: %1 = shl <2 x i64> %v, -; CHECK: ret <2 x i64> %1 +; CHECK-NEXT: %1 = shl <2 x i64> %v, +; CHECK-NEXT: ret <2 x i64> %1 %1 = tail call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %v, <2 x i64> ) ret <2 x i64> %1 } define <2 x i64> @sse2_psll_q_64(<2 x i64> %v) nounwind readnone uwtable { ; CHECK-LABEL: @sse2_psll_q_64 -; CHECK: ret <2 x i64> zeroinitializer +; CHECK-NEXT: ret <2 x i64> zeroinitializer %1 = tail call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %v, <2 x i64> ) ret <2 x i64> %1 } +define <16 x i16> @avx2_psll_w_0(<16 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psll_w_0 +; CHECK-NEXT: ret <16 x i16> %v + %1 = tail call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %v, <8 x i16> zeroinitializer) + ret <16 x i16> %1 +} + define <16 x i16> @avx2_psll_w_15(<16 x i16> %v) nounwind readnone uwtable { ; CHECK-LABEL: @avx2_psll_w_15 -; CHECK: %1 = shl <16 x i16> %v, -; CHECK: ret <16 x i16> %1 +; CHECK-NEXT: %1 = shl <16 x i16> %v, +; CHECK-NEXT: ret <16 x i16> %1 %1 = tail call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %v, <8 x i16> ) ret <16 x i16> %1 } +define <16 x i16> @avx2_psll_w_15_splat(<16 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psll_w_15_splat +; CHECK-NEXT: ret <16 x i16> zeroinitializer + %1 = tail call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %v, <8 x i16> ) + ret <16 x i16> %1 +} + define <16 x i16> @avx2_psll_w_64(<16 x i16> %v) nounwind readnone uwtable { ; CHECK-LABEL: @avx2_psll_w_64 -; CHECK: ret <16 x i16> zeroinitializer +; CHECK-NEXT: ret <16 x i16> zeroinitializer %1 = tail call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %v, <8 x i16> ) ret <16 x i16> %1 } +define <8 x i32> @avx2_psll_d_0(<8 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psll_d_0 +; CHECK-NEXT: ret <8 x i32> %v + %1 = tail call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %v, <4 x i32> zeroinitializer) + ret <8 x i32> %1 +} + define <8 x i32> @avx2_psll_d_15(<8 x i32> %v) nounwind readnone uwtable { ; CHECK-LABEL: @avx2_psll_d_15 -; CHECK: %1 = shl <8 x i32> %v, -; CHECK: ret <8 x i32> %1 +; CHECK-NEXT: %1 = shl <8 x i32> %v, +; CHECK-NEXT: ret <8 x i32> %1 %1 = tail call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %v, <4 x i32> ) ret <8 x i32> %1 } +define <8 x i32> @avx2_psll_d_15_splat(<8 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psll_d_15_splat +; CHECK-NEXT: ret <8 x i32> zeroinitializer + %1 = tail call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %v, <4 x i32> ) + ret <8 x i32> %1 +} + define <8 x i32> @avx2_psll_d_64(<8 x i32> %v) nounwind readnone uwtable { ; CHECK-LABEL: @avx2_psll_d_64 -; CHECK: ret <8 x i32> zeroinitializer +; CHECK-NEXT: ret <8 x i32> zeroinitializer %1 = tail call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %v, <4 x i32> ) ret <8 x i32> %1 } +define <4 x i64> @avx2_psll_q_0(<4 x i64> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psll_q_0 +; CHECK-NEXT: ret <4 x i64> %v + %1 = tail call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %v, <2 x i64> zeroinitializer) + ret <4 x i64> %1 +} + define <4 x i64> @avx2_psll_q_15(<4 x i64> %v) nounwind readnone uwtable { ; CHECK-LABEL: @avx2_psll_q_15 -; CHECK: %1 = shl <4 x i64> %v, -; CHECK: ret <4 x i64> %1 +; CHECK-NEXT: %1 = shl <4 x i64> %v, +; CHECK-NEXT: ret <4 x i64> %1 %1 = tail call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %v, <2 x i64> ) ret <4 x i64> %1 } define <4 x i64> @avx2_psll_q_64(<4 x i64> %v) nounwind readnone uwtable { ; CHECK-LABEL: @avx2_psll_q_64 -; CHECK: ret <4 x i64> zeroinitializer +; CHECK-NEXT: ret <4 x i64> zeroinitializer %1 = tail call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %v, <2 x i64> ) ret <4 x i64> %1 } @@ -660,6 +800,7 @@ declare <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64>, <2 x i64>) #1 declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) #1 declare <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16>, <8 x i16>) #1 + declare <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64>, i32) #1 declare <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32>, i32) #1 declare <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16>, i32) #1