Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -23466,50 +23466,6 @@
       return SDValue();
     }
-
-    // Packed SSE2/AVX2 arithmetic shift immediate intrinsics.
-    case Intrinsic::x86_sse2_psrai_w:
-    case Intrinsic::x86_sse2_psrai_d:
-    case Intrinsic::x86_avx2_psrai_w:
-    case Intrinsic::x86_avx2_psrai_d:
-    case Intrinsic::x86_sse2_psra_w:
-    case Intrinsic::x86_sse2_psra_d:
-    case Intrinsic::x86_avx2_psra_w:
-    case Intrinsic::x86_avx2_psra_d: {
-      SDValue Op0 = N->getOperand(1);
-      SDValue Op1 = N->getOperand(2);
-      EVT VT = Op0.getValueType();
-      assert(VT.isVector() && "Expected a vector type!");
-
-      if (isa<BuildVectorSDNode>(Op1))
-        Op1 = Op1.getOperand(0);
-
-      if (!isa<ConstantSDNode>(Op1))
-        return SDValue();
-
-      EVT SVT = VT.getVectorElementType();
-      unsigned SVTBits = SVT.getSizeInBits();
-
-      ConstantSDNode *CND = cast<ConstantSDNode>(Op1);
-      const APInt &C = APInt(SVTBits, CND->getAPIntValue().getZExtValue());
-      uint64_t ShAmt = C.getZExtValue();
-
-      // Don't try to convert this shift into a ISD::SRA if the shift
-      // count is bigger than or equal to the element size.
-      if (ShAmt >= SVTBits)
-        return SDValue();
-
-      // Trivial case: if the shift count is zero, then fold this
-      // into the first operand.
-      if (ShAmt == 0)
-        return Op0;
-
-      // Replace this packed shift intrinsic with a target independent
-      // shift dag node.
-      SDLoc DL(N);
-      SDValue Splat = DAG.getConstant(C, DL, VT);
-      return DAG.getNode(ISD::SRA, DL, VT, Op0, Splat);
-    }
   }
 }
Index: lib/Transforms/InstCombine/InstCombineCalls.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -199,7 +199,9 @@
 
 static Value *SimplifyX86immshift(const IntrinsicInst &II,
                                   InstCombiner::BuilderTy &Builder,
-                                  bool ShiftLeft) {
+                                  bool LogicalShift, bool ShiftLeft) {
+  assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");
+
   // Simplify if count is constant.
   auto Arg1 = II.getArgOperand(1);
   auto CAZ = dyn_cast<ConstantAggregateZero>(Arg1);
@@ -238,9 +240,15 @@
   if (Count == 0)
     return Vec;
 
-  // Handle cases when Shift >= BitWidth - just return zero.
-  if (Count.uge(BitWidth))
-    return ConstantAggregateZero::get(VT);
+  // Handle cases when Shift >= BitWidth.
+  if (Count.uge(BitWidth)) {
+    // If LogicalShift - just return zero.
+    if (LogicalShift)
+      return ConstantAggregateZero::get(VT);
+
+    // If ArithmeticShift - clamp Shift to (BitWidth - 1).
+    Count = APInt(64, BitWidth - 1);
+  }
 
   // Get a constant vector of the same type as the first operand.
   auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth));
@@ -249,7 +257,10 @@
   if (ShiftLeft)
     return Builder.CreateShl(Vec, ShiftVec);
 
-  return Builder.CreateLShr(Vec, ShiftVec);
+  if (LogicalShift)
+    return Builder.CreateLShr(Vec, ShiftVec);
+
+  return Builder.CreateAShr(Vec, ShiftVec);
 }
 
 static Value *SimplifyX86extend(const IntrinsicInst &II,
@@ -776,6 +787,19 @@
     break;
   }
 
+  // Constant fold ashr( <A x Bi>, Ci ).
+  case Intrinsic::x86_sse2_psra_d:
+  case Intrinsic::x86_sse2_psra_w:
+  case Intrinsic::x86_sse2_psrai_d:
+  case Intrinsic::x86_sse2_psrai_w:
+  case Intrinsic::x86_avx2_psra_d:
+  case Intrinsic::x86_avx2_psra_w:
+  case Intrinsic::x86_avx2_psrai_d:
+  case Intrinsic::x86_avx2_psrai_w:
+    if (Value *V = SimplifyX86immshift(*II, *Builder, false, false))
+      return ReplaceInstUsesWith(*II, V);
+    break;
+
   // Constant fold lshr( <A x Bi>, Ci ).
   case Intrinsic::x86_sse2_psrl_d:
   case Intrinsic::x86_sse2_psrl_q:
@@ -789,7 +813,7 @@
   case Intrinsic::x86_avx2_psrli_d:
   case Intrinsic::x86_avx2_psrli_q:
   case Intrinsic::x86_avx2_psrli_w:
-    if (Value *V = SimplifyX86immshift(*II, *Builder, false))
+    if (Value *V = SimplifyX86immshift(*II, *Builder, true, false))
       return ReplaceInstUsesWith(*II, V);
     break;
 
@@ -806,7 +830,7 @@
   case Intrinsic::x86_avx2_pslli_d:
   case Intrinsic::x86_avx2_pslli_q:
   case Intrinsic::x86_avx2_pslli_w:
-    if (Value *V = SimplifyX86immshift(*II, *Builder, true))
+    if (Value *V = SimplifyX86immshift(*II, *Builder, true, true))
      return ReplaceInstUsesWith(*II, V);
     break;
 
Index: test/CodeGen/X86/combine-avx2-intrinsics.ll
===================================================================
--- test/CodeGen/X86/combine-avx2-intrinsics.ll
+++ test/CodeGen/X86/combine-avx2-intrinsics.ll
@@ -3,47 +3,6 @@
 
 ; Verify that the backend correctly combines AVX2 builtin intrinsics.
 
-define <8 x i32> @test_psra_1(<8 x i32> %A) {
-  %1 = tail call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %A, i32 3)
-  %2 = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %1, <4 x i32> )
-  %3 = tail call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %2, i32 2)
-  ret <8 x i32> %3
-}
-; CHECK-LABEL: test_psra_1
-; CHECK: vpsrad $8, %ymm0, %ymm0
-; CHECK-NEXT: ret
-
-define <16 x i16> @test_psra_2(<16 x i16> %A) {
-  %1 = tail call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %A, i32 3)
-  %2 = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %1, <8 x i16> )
-  %3 = tail call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %2, i32 2)
-  ret <16 x i16> %3
-}
-; CHECK-LABEL: test_psra_2
-; CHECK: vpsraw $8, %ymm0, %ymm0
-; CHECK-NEXT: ret
-
-define <16 x i16> @test_psra_3(<16 x i16> %A) {
-  %1 = tail call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %A, i32 0)
-  %2 = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %1, <8 x i16> )
-  %3 = tail call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %2, i32 0)
-  ret <16 x i16> %3
-}
-; CHECK-LABEL: test_psra_3
-; CHECK-NOT: vpsraw
-; CHECK: ret
-
-define <8 x i32> @test_psra_4(<8 x i32> %A) {
-  %1 = tail call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %A, i32 0)
-  %2 = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %1, <4 x i32> )
-  %3 = tail call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %2, i32 0)
-  ret <8 x i32> %3
-}
-; CHECK-LABEL: test_psra_4
-; CHECK-NOT: vpsrad
-; CHECK: ret
-
-
 define <32 x i8> @test_x86_avx2_pblendvb(<32 x i8> %a0, <32 x i8> %a1) {
   %res = call <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8> %a0, <32 x i8> %a0, <32 x i8> %a1)
   ret <32 x i8> %res
@@ -157,8 +116,4 @@
 declare <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16>, <16 x i16>, i32)
 declare <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32>, <4 x i32>, i32)
 declare <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32>, <8 x i32>, i32)
-declare <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16>, <8 x i16>)
-declare <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16>, i32)
-declare <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32>, <4 x i32>)
-declare <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32>, i32)
 
Index: test/CodeGen/X86/combine-sse2-intrinsics.ll
===================================================================
--- test/CodeGen/X86/combine-sse2-intrinsics.ll
+++ test/CodeGen/X86/combine-sse2-intrinsics.ll
@@ -1,53 +0,0 @@
-; RUN: llc < %s -march=x86 -mcpu=core2 | FileCheck %s
-; RUN: llc < %s -march=x86-64 -mcpu=corei7 | FileCheck %s
-
-; Verify that the backend correctly combines SSE2 builtin intrinsics.
-
-
-define <4 x i32> @test_psra_1(<4 x i32> %A) {
-  %1 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %A, i32 3)
-  %2 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %1, <4 x i32> )
-  %3 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %2, i32 2)
-  ret <4 x i32> %3
-}
-; CHECK-LABEL: test_psra_1
-; CHECK: psrad $8, %xmm0
-; CHECK-NEXT: ret
-
-define <8 x i16> @test_psra_2(<8 x i16> %A) {
-  %1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %A, i32 3)
-  %2 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %1, <8 x i16> )
-  %3 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %2, i32 2)
-  ret <8 x i16> %3
-}
-; CHECK-LABEL: test_psra_2
-; CHECK: psraw $8, %xmm0
-; CHECK-NEXT: ret
-
-define <4 x i32> @test_psra_3(<4 x i32> %A) {
-  %1 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %A, i32 0)
-  %2 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %1, <4 x i32> )
-  %3 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %2, i32 0)
-  ret <4 x i32> %3
-}
-; CHECK-LABEL: test_psra_3
-; CHECK-NOT: psrad
-; CHECK: ret
-
-
-define <8 x i16> @test_psra_4(<8 x i16> %A) {
-  %1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %A, i32 0)
-  %2 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %1, <8 x i16> )
-  %3 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %2, i32 0)
-  ret <8 x i16> %3
-}
-; CHECK-LABEL: test_psra_4
-; CHECK-NOT: psraw
-; CHECK: ret
-
-
-declare <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16>, <8 x i16>)
-declare <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16>, i32)
-declare <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32>, <4 x i32>)
-declare <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32>, i32)
-
Index: test/Transforms/InstCombine/x86-vector-shifts.ll
===================================================================
--- test/Transforms/InstCombine/x86-vector-shifts.ll
+++ test/Transforms/InstCombine/x86-vector-shifts.ll
@@ -2,6 +2,102 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 ;
+; ASHR - Immediate
+;
+
+define <8 x i16> @sse2_psrai_w_0(<8 x i16> %v) nounwind readnone uwtable {
+; CHECK-LABEL: @sse2_psrai_w_0
+; CHECK-NEXT: ret <8 x i16> %v
+  %1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %v, i32 0)
+  ret <8 x i16> %1
+}
+
+define <8 x i16> @sse2_psrai_w_15(<8 x i16> %v) nounwind readnone uwtable {
+; CHECK-LABEL: @sse2_psrai_w_15
+; CHECK-NEXT: %1 = ashr <8 x i16> %v,
+; CHECK-NEXT: ret <8 x i16> %1
+  %1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %v, i32 15)
+  ret <8 x i16> %1
+}
+
+define <8 x i16> @sse2_psrai_w_64(<8 x i16> %v) nounwind readnone uwtable {
+; CHECK-LABEL: @sse2_psrai_w_64
+; CHECK-NEXT: %1 = ashr <8 x i16> %v,
+; CHECK-NEXT: ret <8 x i16> %1
+  %1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %v, i32 64)
+  ret <8 x i16> %1
+}
+
+define <4 x i32> @sse2_psrai_d_0(<4 x i32> %v) nounwind readnone uwtable {
+; CHECK-LABEL: @sse2_psrai_d_0
+; CHECK-NEXT: ret <4 x i32> %v
+  %1 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %v, i32 0)
+  ret <4 x i32> %1
+}
+
+define <4 x i32> @sse2_psrai_d_15(<4 x i32> %v) nounwind readnone uwtable {
+; CHECK-LABEL: @sse2_psrai_d_15
+; CHECK-NEXT: %1 = ashr <4 x i32> %v,
+; CHECK-NEXT: ret <4 x i32> %1
+  %1 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %v, i32 15)
+  ret <4 x i32> %1
+}
+
+define <4 x i32> @sse2_psrai_d_64(<4 x i32> %v) nounwind readnone uwtable {
+; CHECK-LABEL: @sse2_psrai_d_64
+; CHECK-NEXT: %1 = ashr <4 x i32> %v,
+; CHECK-NEXT: ret <4 x i32> %1
+  %1 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %v, i32 64)
+  ret <4 x i32> %1
+}
+
+define <16 x i16> @avx2_psrai_w_0(<16 x i16> %v) nounwind readnone uwtable {
+; CHECK-LABEL: @avx2_psrai_w_0
+; CHECK-NEXT: ret <16 x i16> %v
+  %1 = tail call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %v, i32 0)
+  ret <16 x i16> %1
+}
+
+define <16 x i16> @avx2_psrai_w_15(<16 x i16> %v) nounwind readnone uwtable {
+; CHECK-LABEL: @avx2_psrai_w_15
+; CHECK-NEXT: %1 = ashr <16 x i16> %v,
+; CHECK-NEXT: ret <16 x i16> %1
+  %1 = tail call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %v, i32 15)
+  ret <16 x i16> %1
+}
+
+define <16 x i16> @avx2_psrai_w_64(<16 x i16> %v) nounwind readnone uwtable {
+; CHECK-LABEL: @avx2_psrai_w_64
+; CHECK-NEXT: %1 = ashr <16 x i16> %v,
+; CHECK-NEXT: ret <16 x i16> %1
+  %1 = tail call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %v, i32 64)
+  ret <16 x i16> %1
+}
+
+define <8 x i32> @avx2_psrai_d_0(<8 x i32> %v) nounwind readnone uwtable {
+; CHECK-LABEL: @avx2_psrai_d_0
+; CHECK-NEXT: ret <8 x i32> %v
+  %1 = tail call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %v, i32 0)
+  ret <8 x i32> %1
+}
+
+define <8 x i32> @avx2_psrai_d_15(<8 x i32> %v) nounwind readnone uwtable {
+; CHECK-LABEL: @avx2_psrai_d_15
+; CHECK-NEXT: %1 = ashr <8 x i32> %v,
+; CHECK-NEXT: ret <8 x i32> %1
+  %1 = tail call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %v, i32 15)
+  ret <8 x i32> %1
+}
+
+define <8 x i32> @avx2_psrai_d_64(<8 x i32> %v) nounwind readnone uwtable {
+; CHECK-LABEL: @avx2_psrai_d_64
+; CHECK-NEXT: %1 = ashr <8 x i32> %v,
+; CHECK-NEXT: ret <8 x i32> %1
+  %1 = tail call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %v, i32 64)
+  ret <8 x i32> %1
+}
+
+;
 ; LSHR - Immediate
 ;
@@ -274,6 +370,134 @@
 }
 
 ;
+; ASHR - Constant Vector
+;
+
+define <8 x i16> @sse2_psra_w_0(<8 x i16> %v) nounwind readnone uwtable {
+; CHECK-LABEL: @sse2_psra_w_0
+; CHECK-NEXT: ret <8 x i16> %v
+  %1 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %v, <8 x i16> zeroinitializer)
+  ret <8 x i16> %1
+}
+
+define <8 x i16> @sse2_psra_w_15(<8 x i16> %v) nounwind readnone uwtable {
+; CHECK-LABEL: @sse2_psra_w_15
+; CHECK-NEXT: %1 = ashr <8 x i16> %v,
+; CHECK-NEXT: ret <8 x i16> %1
+  %1 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %v, <8 x i16> )
+  ret <8 x i16> %1
+}
+
+define <8 x i16> @sse2_psra_w_15_splat(<8 x i16> %v) nounwind readnone uwtable {
+; CHECK-LABEL: @sse2_psra_w_15_splat
+; CHECK-NEXT: %1 = ashr <8 x i16> %v,
+; CHECK-NEXT: ret <8 x i16> %1
+  %1 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %v, <8 x i16> )
+  ret <8 x i16> %1
+}
+
+define <8 x i16> @sse2_psra_w_64(<8 x i16> %v) nounwind readnone uwtable {
+; CHECK-LABEL: @sse2_psra_w_64
+; CHECK-NEXT: %1 = ashr <8 x i16> %v,
+; CHECK-NEXT: ret <8 x i16> %1
+  %1 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %v, <8 x i16> )
+  ret <8 x i16> %1
+}
+
+define <4 x i32> @sse2_psra_d_0(<4 x i32> %v) nounwind readnone uwtable {
+; CHECK-LABEL: @sse2_psra_d_0
+; CHECK-NEXT: ret <4 x i32> %v
+  %1 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %v, <4 x i32> zeroinitializer)
+  ret <4 x i32> %1
+}
+
+define <4 x i32> @sse2_psra_d_15(<4 x i32> %v) nounwind readnone uwtable {
+; CHECK-LABEL: @sse2_psra_d_15
+; CHECK-NEXT: %1 = ashr <4 x i32> %v,
+; CHECK-NEXT: ret <4 x i32> %1
+  %1 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %v, <4 x i32> )
+  ret <4 x i32> %1
+}
+
+define <4 x i32> @sse2_psra_d_15_splat(<4 x i32> %v) nounwind readnone uwtable {
+; CHECK-LABEL: @sse2_psra_d_15_splat
+; CHECK-NEXT: %1 = ashr <4 x i32> %v,
+; CHECK-NEXT: ret <4 x i32> %1
+  %1 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %v, <4 x i32> )
+  ret <4 x i32> %1
+}
+
+define <4 x i32> @sse2_psra_d_64(<4 x i32> %v) nounwind readnone uwtable {
+; CHECK-LABEL: @sse2_psra_d_64
+; CHECK-NEXT: %1 = ashr <4 x i32> %v,
+; CHECK-NEXT: ret <4 x i32> %1
+  %1 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %v, <4 x i32> )
+  ret <4 x i32> %1
+}
+
+define <16 x i16> @avx2_psra_w_0(<16 x i16> %v) nounwind readnone uwtable {
+; CHECK-LABEL: @avx2_psra_w_0
+; CHECK-NEXT: ret <16 x i16> %v
+  %1 = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %v, <8 x i16> zeroinitializer)
+  ret <16 x i16> %1
+}
+
+define <16 x i16> @avx2_psra_w_15(<16 x i16> %v) nounwind readnone uwtable {
+; CHECK-LABEL: @avx2_psra_w_15
+; CHECK-NEXT: %1 = ashr <16 x i16> %v,
+; CHECK-NEXT: ret <16 x i16> %1
+  %1 = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %v, <8 x i16> )
+  ret <16 x i16> %1
+}
+
+define <16 x i16> @avx2_psra_w_15_splat(<16 x i16> %v) nounwind readnone uwtable {
+; CHECK-LABEL: @avx2_psra_w_15_splat
+; CHECK-NEXT: %1 = ashr <16 x i16> %v,
+; CHECK-NEXT: ret <16 x i16> %1
+  %1 = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %v, <8 x i16> )
+  ret <16 x i16> %1
+}
+
+define <16 x i16> @avx2_psra_w_64(<16 x i16> %v) nounwind readnone uwtable {
+; CHECK-LABEL: @avx2_psra_w_64
+; CHECK-NEXT: %1 = ashr <16 x i16> %v,
+; CHECK-NEXT: ret <16 x i16> %1
+  %1 = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %v, <8 x i16> )
+  ret <16 x i16> %1
+}
+
+define <8 x i32> @avx2_psra_d_0(<8 x i32> %v) nounwind readnone uwtable {
+; CHECK-LABEL: @avx2_psra_d_0
+; CHECK-NEXT: ret <8 x i32> %v
+  %1 = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %v, <4 x i32> zeroinitializer)
+  ret <8 x i32> %1
+}
+
+define <8 x i32> @avx2_psra_d_15(<8 x i32> %v) nounwind readnone uwtable {
+; CHECK-LABEL: @avx2_psra_d_15
+; CHECK-NEXT: %1 = ashr <8 x i32> %v,
+; CHECK-NEXT: ret <8 x i32> %1
+  %1 = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %v, <4 x i32> )
+  ret <8 x i32> %1
+}
+
+define <8 x i32> @avx2_psra_d_15_splat(<8 x i32> %v) nounwind readnone uwtable {
+; CHECK-LABEL: @avx2_psra_d_15_splat
+; CHECK-NEXT: %1 = ashr <8 x i32> %v,
+; CHECK-NEXT: ret <8 x i32> %1
+  %1 = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %v, <4 x i32> )
+  ret <8 x i32> %1
+}
+
+define <8 x i32> @avx2_psra_d_64(<8 x i32> %v) nounwind readnone uwtable {
+; CHECK-LABEL: @avx2_psra_d_64
+; CHECK-NEXT: %1 = ashr <8 x i32> %v,
+; CHECK-NEXT: ret <8 x i32> %1
+  %1 = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %v, <4 x i32> )
+  ret <8 x i32> %1
+}
+
+;
 ; LSHR - Constant Vector
 ;
@@ -605,6 +829,82 @@
 ; Constant Folding
 ;
 
+define <8 x i16> @test_sse2_psra_w_0(<8 x i16> %A) {
+; CHECK-LABEL: @test_sse2_psra_w_0
+; CHECK-NEXT: ret <8 x i16> %A
+  %1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %A, i32 0)
+  %2 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %1, <8 x i16> )
+  %3 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %2, i32 0)
+  ret <8 x i16> %3
+}
+
+define <8 x i16> @test_sse2_psra_w_8() {
+; CHECK-LABEL: @test_sse2_psra_w_8
+; CHECK-NEXT: ret <8 x i16>
+  %1 = bitcast <2 x i64> to <8 x i16>
+  %2 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %1, i32 3)
+  %3 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %2, <8 x i16> )
+  %4 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %3, i32 2)
+  ret <8 x i16> %4
+}
+
+define <4 x i32> @test_sse2_psra_d_0(<4 x i32> %A) {
+; CHECK-LABEL: @test_sse2_psra_d_0
+; CHECK-NEXT: ret <4 x i32> %A
+  %1 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %A, i32 0)
+  %2 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %1, <4 x i32> )
+  %3 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %1, i32 0)
+  ret <4 x i32> %3
+}
+
+define <4 x i32> @sse2_psra_d_8() {
+; CHECK-LABEL: @sse2_psra_d_8
+; CHECK-NEXT: ret <4 x i32>
+  %1 = bitcast <2 x i64> to <4 x i32>
+  %2 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %1, i32 3)
+  %3 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %2, <4 x i32> )
+  %4 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %3, i32 2)
+  ret <4 x i32> %4
+}
+
+define <16 x i16> @test_avx2_psra_w_0(<16 x i16> %A) {
+; CHECK-LABEL: @test_avx2_psra_w_0
+; CHECK-NEXT: ret <16 x i16> %A
+  %1 = tail call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %A, i32 0)
+  %2 = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %1, <8 x i16> )
+  %3 = tail call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %2, i32 0)
+  ret <16 x i16> %3
+}
+
+define <16 x i16> @test_avx2_psra_w_8(<16 x i16> %A) {
+; CHECK-LABEL: @test_avx2_psra_w_8
+; CHECK-NEXT: ret <16 x i16>
+  %1 = bitcast <4 x i64> to <16 x i16>
+  %2 = tail call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %1, i32 3)
+  %3 = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %2, <8 x i16> )
+  %4 = tail call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %3, i32 2)
+  ret <16 x i16> %4
+}
+
+define <8 x i32> @test_avx2_psra_d_0(<8 x i32> %A) {
+; CHECK-LABEL: @test_avx2_psra_d_0
+; CHECK-NEXT: ret <8 x i32> %A
+  %1 = tail call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %A, i32 0)
+  %2 = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %1, <4 x i32> )
+  %3 = tail call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %2, i32 0)
+  ret <8 x i32> %3
+}
+
+define <8 x i32> @test_avx2_psra_d_8() {
+; CHECK-LABEL: @test_avx2_psra_d_8
+; CHECK-NEXT: ret <8 x i32>
+  %1 = bitcast <4 x i64> to <8 x i32>
+  %2 = tail call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %1, i32 3)
+  %3 = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %2, <4 x i32> )
+  %4 = tail call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %3, i32 2)
+  ret <8 x i32> %4
+}
+
 define <2 x i64> @test_sse2_1() nounwind readnone uwtable {
   %S = bitcast i32 1 to i32
   %1 = zext i32 %S to i64
@@ -814,4 +1114,13 @@
 declare <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>) #1
 declare <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16>, <8 x i16>) #1
+declare <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32>, i32) #1
+declare <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16>, i32) #1
+declare <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32>, <4 x i32>) #1
+declare <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16>, <8 x i16>) #1
+declare <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32>, i32) #1
+declare <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16>, i32) #1
+declare <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32>, <4 x i32>) #1
+declare <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16>, <8 x i16>) #1
+
 
 attributes #1 = { nounwind readnone }