Index: llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp
===================================================================
--- llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -198,8 +198,52 @@
 }
 
 static Value *SimplifyX86immshift(const IntrinsicInst &II,
-                                  InstCombiner::BuilderTy &Builder,
-                                  bool LogicalShift, bool ShiftLeft) {
+                                  InstCombiner::BuilderTy &Builder) {
+  bool LogicalShift = false;
+  bool ShiftLeft = false;
+
+  switch (II.getIntrinsicID()) {
+  default:
+    return nullptr;
+  case Intrinsic::x86_sse2_psra_d:
+  case Intrinsic::x86_sse2_psra_w:
+  case Intrinsic::x86_sse2_psrai_d:
+  case Intrinsic::x86_sse2_psrai_w:
+  case Intrinsic::x86_avx2_psra_d:
+  case Intrinsic::x86_avx2_psra_w:
+  case Intrinsic::x86_avx2_psrai_d:
+  case Intrinsic::x86_avx2_psrai_w:
+    LogicalShift = false; ShiftLeft = false;
+    break;
+  case Intrinsic::x86_sse2_psrl_d:
+  case Intrinsic::x86_sse2_psrl_q:
+  case Intrinsic::x86_sse2_psrl_w:
+  case Intrinsic::x86_sse2_psrli_d:
+  case Intrinsic::x86_sse2_psrli_q:
+  case Intrinsic::x86_sse2_psrli_w:
+  case Intrinsic::x86_avx2_psrl_d:
+  case Intrinsic::x86_avx2_psrl_q:
+  case Intrinsic::x86_avx2_psrl_w:
+  case Intrinsic::x86_avx2_psrli_d:
+  case Intrinsic::x86_avx2_psrli_q:
+  case Intrinsic::x86_avx2_psrli_w:
+    LogicalShift = true; ShiftLeft = false;
+    break;
+  case Intrinsic::x86_sse2_psll_d:
+  case Intrinsic::x86_sse2_psll_q:
+  case Intrinsic::x86_sse2_psll_w:
+  case Intrinsic::x86_sse2_pslli_d:
+  case Intrinsic::x86_sse2_pslli_q:
+  case Intrinsic::x86_sse2_pslli_w:
+  case Intrinsic::x86_avx2_psll_d:
+  case Intrinsic::x86_avx2_psll_q:
+  case Intrinsic::x86_avx2_psll_w:
+  case Intrinsic::x86_avx2_pslli_d:
+  case Intrinsic::x86_avx2_pslli_q:
+  case Intrinsic::x86_avx2_pslli_w:
+    LogicalShift = true; ShiftLeft = true;
+    break;
+  }
   assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");
 
   // Simplify if count is constant.
@@ -788,51 +832,64 @@
   }
 
   // Constant fold ashr( <A x Bi>, Ci ).
-  case Intrinsic::x86_sse2_psra_d:
-  case Intrinsic::x86_sse2_psra_w:
+  // Constant fold lshr( <A x Bi>, Ci ).
+  // Constant fold shl( <A x Bi>, Ci ).
   case Intrinsic::x86_sse2_psrai_d:
   case Intrinsic::x86_sse2_psrai_w:
-  case Intrinsic::x86_avx2_psra_d:
-  case Intrinsic::x86_avx2_psra_w:
   case Intrinsic::x86_avx2_psrai_d:
   case Intrinsic::x86_avx2_psrai_w:
-    if (Value *V = SimplifyX86immshift(*II, *Builder, false, false))
-      return ReplaceInstUsesWith(*II, V);
-    break;
-
-  // Constant fold lshr( <A x Bi>, Ci ).
-  case Intrinsic::x86_sse2_psrl_d:
-  case Intrinsic::x86_sse2_psrl_q:
-  case Intrinsic::x86_sse2_psrl_w:
   case Intrinsic::x86_sse2_psrli_d:
   case Intrinsic::x86_sse2_psrli_q:
   case Intrinsic::x86_sse2_psrli_w:
-  case Intrinsic::x86_avx2_psrl_d:
-  case Intrinsic::x86_avx2_psrl_q:
-  case Intrinsic::x86_avx2_psrl_w:
   case Intrinsic::x86_avx2_psrli_d:
   case Intrinsic::x86_avx2_psrli_q:
   case Intrinsic::x86_avx2_psrli_w:
-    if (Value *V = SimplifyX86immshift(*II, *Builder, true, false))
+  case Intrinsic::x86_sse2_pslli_d:
+  case Intrinsic::x86_sse2_pslli_q:
+  case Intrinsic::x86_sse2_pslli_w:
+  case Intrinsic::x86_avx2_pslli_d:
+  case Intrinsic::x86_avx2_pslli_q:
+  case Intrinsic::x86_avx2_pslli_w:
+    if (Value *V = SimplifyX86immshift(*II, *Builder))
       return ReplaceInstUsesWith(*II, V);
     break;
 
-  // Constant fold shl( <A x Bi>, Ci ).
+  case Intrinsic::x86_sse2_psra_d:
+  case Intrinsic::x86_sse2_psra_w:
+  case Intrinsic::x86_avx2_psra_d:
+  case Intrinsic::x86_avx2_psra_w:
+  case Intrinsic::x86_sse2_psrl_d:
+  case Intrinsic::x86_sse2_psrl_q:
+  case Intrinsic::x86_sse2_psrl_w:
+  case Intrinsic::x86_avx2_psrl_d:
+  case Intrinsic::x86_avx2_psrl_q:
+  case Intrinsic::x86_avx2_psrl_w:
   case Intrinsic::x86_sse2_psll_d:
   case Intrinsic::x86_sse2_psll_q:
   case Intrinsic::x86_sse2_psll_w:
-  case Intrinsic::x86_sse2_pslli_d:
-  case Intrinsic::x86_sse2_pslli_q:
-  case Intrinsic::x86_sse2_pslli_w:
   case Intrinsic::x86_avx2_psll_d:
   case Intrinsic::x86_avx2_psll_q:
-  case Intrinsic::x86_avx2_psll_w:
-  case Intrinsic::x86_avx2_pslli_d:
-  case Intrinsic::x86_avx2_pslli_q:
-  case Intrinsic::x86_avx2_pslli_w:
-    if (Value *V = SimplifyX86immshift(*II, *Builder, true, true))
+  case Intrinsic::x86_avx2_psll_w: {
+    if (Value *V = SimplifyX86immshift(*II, *Builder))
       return ReplaceInstUsesWith(*II, V);
+
+    // SSE2/AVX2 uses only the first 64-bits of the 128-bit vector
+    // operand to compute the shift amount.
+    auto ShiftAmt = II->getArgOperand(1);
+    auto ShiftType = cast<VectorType>(ShiftAmt->getType());
+    assert(ShiftType->getPrimitiveSizeInBits() == 128 &&
+           "Unexpected packed shift size");
+    unsigned VWidth = ShiftType->getNumElements();
+
+    APInt DemandedElts = APInt::getLowBitsSet(VWidth, VWidth / 2);
+    APInt UndefElts(VWidth, 0);
+    if (Value *V =
+            SimplifyDemandedVectorElts(ShiftAmt, DemandedElts, UndefElts)) {
+      II->setArgOperand(1, V);
+      return II;
+    }
     break;
+  }
 
   case Intrinsic::x86_sse41_pmovsxbd:
   case Intrinsic::x86_sse41_pmovsxbq:
Index: llvm/trunk/test/Transforms/InstCombine/x86-vector-shifts.ll
===================================================================
--- llvm/trunk/test/Transforms/InstCombine/x86-vector-shifts.ll
+++ llvm/trunk/test/Transforms/InstCombine/x86-vector-shifts.ll
@@ -826,6 +826,154 @@
 }
 
 ;
+; Vector Demanded Bits
+;
+
+define <8 x i16> @sse2_psra_w_var(<8 x i16> %v, <8 x i16> %a) nounwind readnone uwtable {
+; CHECK-LABEL: @sse2_psra_w_var
+; CHECK-NEXT: %1 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %v, <8 x i16> %a)
+; CHECK-NEXT: ret <8 x i16> %1
+  %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+  %2 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %v, <8 x i16> %1)
+  ret <8 x i16> %2
+}
+
+define <4 x i32> @sse2_psra_d_var(<4 x i32> %v, <4 x i32> %a) nounwind readnone uwtable {
+; CHECK-LABEL: @sse2_psra_d_var
+; CHECK-NEXT: %1 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %v, <4 x i32> %a)
+; CHECK-NEXT: ret <4 x i32> %1
+  %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+  %2 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %v, <4 x i32> %1)
+  ret <4 x i32> %2
+}
+
+define <16 x i16> @avx2_psra_w_var(<16 x i16> %v, <8 x i16> %a) nounwind readnone uwtable {
+; CHECK-LABEL: @avx2_psra_w_var
+; CHECK-NEXT: %1 = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %v, <8 x i16> %a)
+; CHECK-NEXT: ret <16 x i16> %1
+  %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+  %2 = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %v, <8 x i16> %1)
+  ret <16 x i16> %2
+}
+
+define <8 x i32> @avx2_psra_d_var(<8 x i32> %v, <4 x i32> %a) nounwind readnone uwtable {
+; CHECK-LABEL: @avx2_psra_d_var
+; CHECK-NEXT: %1 = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %v, <4 x i32> %a)
+; CHECK-NEXT: ret <8 x i32> %1
+  %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+  %2 = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %v, <4 x i32> %1)
+  ret <8 x i32> %2
+}
+
+define <8 x i16> @sse2_psrl_w_var(<8 x i16> %v, <8 x i16> %a) nounwind readnone uwtable {
+; CHECK-LABEL: @sse2_psrl_w_var
+; CHECK-NEXT: %1 = tail call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %v, <8 x i16> %a)
+; CHECK-NEXT: ret <8 x i16> %1
+  %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+  %2 = tail call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %v, <8 x i16> %1)
+  ret <8 x i16> %2
+}
+
+define <4 x i32> @sse2_psrl_d_var(<4 x i32> %v, <4 x i32> %a) nounwind readnone uwtable {
+; CHECK-LABEL: @sse2_psrl_d_var
+; CHECK-NEXT: %1 = tail call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %v, <4 x i32> %a)
+; CHECK-NEXT: ret <4 x i32> %1
+  %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+  %2 = tail call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %v, <4 x i32> %1)
+  ret <4 x i32> %2
+}
+
+define <2 x i64> @sse2_psrl_q_var(<2 x i64> %v, <2 x i64> %a) nounwind readnone uwtable {
+; CHECK-LABEL: @sse2_psrl_q_var
+; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %v, <2 x i64> %a)
+; CHECK-NEXT: ret <2 x i64> %1
+  %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
+  %2 = tail call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %v, <2 x i64> %1)
+  ret <2 x i64> %2
+}
+
+define <16 x i16> @avx2_psrl_w_var(<16 x i16> %v, <8 x i16> %a) nounwind readnone uwtable {
+; CHECK-LABEL: @avx2_psrl_w_var
+; CHECK-NEXT: %1 = tail call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %v, <8 x i16> %a)
+; CHECK-NEXT: ret <16 x i16> %1
+  %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+  %2 = tail call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %v, <8 x i16> %1)
+  ret <16 x i16> %2
+}
+
+define <8 x i32> @avx2_psrl_d_var(<8 x i32> %v, <4 x i32> %a) nounwind readnone uwtable {
+; CHECK-LABEL: @avx2_psrl_d_var
+; CHECK-NEXT: %1 = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %v, <4 x i32> %a)
+; CHECK-NEXT: ret <8 x i32> %1
+  %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+  %2 = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %v, <4 x i32> %1)
+  ret <8 x i32> %2
+}
+
+define <4 x i64> @avx2_psrl_q_var(<4 x i64> %v, <2 x i64> %a) nounwind readnone uwtable {
+; CHECK-LABEL: @avx2_psrl_q_var
+; CHECK-NEXT: %1 = tail call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %v, <2 x i64> %a)
+; CHECK-NEXT: ret <4 x i64> %1
+  %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
+  %2 = tail call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %v, <2 x i64> %1)
+  ret <4 x i64> %2
+}
+
+define <8 x i16> @sse2_psll_w_var(<8 x i16> %v, <8 x i16> %a) nounwind readnone uwtable {
+; CHECK-LABEL: @sse2_psll_w_var
+; CHECK-NEXT: %1 = tail call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %v, <8 x i16> %a)
+; CHECK-NEXT: ret <8 x i16> %1
+  %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+  %2 = tail call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %v, <8 x i16> %1)
+  ret <8 x i16> %2
+}
+
+define <4 x i32> @sse2_psll_d_var(<4 x i32> %v, <4 x i32> %a) nounwind readnone uwtable {
+; CHECK-LABEL: @sse2_psll_d_var
+; CHECK-NEXT: %1 = tail call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %v, <4 x i32> %a)
+; CHECK-NEXT: ret <4 x i32> %1
+  %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+  %2 = tail call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %v, <4 x i32> %1)
+  ret <4 x i32> %2
+}
+
+define <2 x i64> @sse2_psll_q_var(<2 x i64> %v, <2 x i64> %a) nounwind readnone uwtable {
+; CHECK-LABEL: @sse2_psll_q_var
+; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %v, <2 x i64> %a)
+; CHECK-NEXT: ret <2 x i64> %1
+  %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
+  %2 = tail call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %v, <2 x i64> %1)
+  ret <2 x i64> %2
+}
+
+define <16 x i16> @avx2_psll_w_var(<16 x i16> %v, <8 x i16> %a) nounwind readnone uwtable {
+; CHECK-LABEL: @avx2_psll_w_var
+; CHECK-NEXT: %1 = tail call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %v, <8 x i16> %a)
+; CHECK-NEXT: ret <16 x i16> %1
+  %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+  %2 = tail call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %v, <8 x i16> %1)
+  ret <16 x i16> %2
+}
+
+define <8 x i32> @avx2_psll_d_var(<8 x i32> %v, <4 x i32> %a) nounwind readnone uwtable {
+; CHECK-LABEL: @avx2_psll_d_var
+; CHECK-NEXT: %1 = tail call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %v, <4 x i32> %a)
+; CHECK-NEXT: ret <8 x i32> %1
+  %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+  %2 = tail call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %v, <4 x i32> %1)
+  ret <8 x i32> %2
+}
+
+define <4 x i64> @avx2_psll_q_var(<4 x i64> %v, <2 x i64> %a) nounwind readnone uwtable {
+; CHECK-LABEL: @avx2_psll_q_var
+; CHECK-NEXT: %1 = tail call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %v, <2 x i64> %a)
+; CHECK-NEXT: ret <4 x i64> %1
+  %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
+  %2 = tail call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %v, <2 x i64> %1)
+  ret <4 x i64> %2
+}
+
+;
 ; Constant Folding
 ;
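
Note (illustrative sketch, not part of the patch): the new SimplifyDemandedVectorElts call marks only the low 64 bits of the 128-bit shift-amount operand as demanded, so instructions that only define the upper lanes of that operand should also become removable, not just the shuffles exercised by the tests above. The function name below is hypothetical and only sketches the expected behaviour under that assumption.

; Hypothetical example: an insertelement into lane 3 (upper 64 bits of the
; shift amount) is not demanded by psrl.d, so instcombine should drop it and
; feed %a to the intrinsic directly.
define <4 x i32> @sse2_psrl_d_upper_lane(<4 x i32> %v, <4 x i32> %a) {
  %amt = insertelement <4 x i32> %a, i32 9999, i32 3
  %r = tail call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %v, <4 x i32> %amt)
  ret <4 x i32> %r
}
declare <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>)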