Index: lib/Transforms/InstCombine/InstCombineCalls.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -198,8 +198,52 @@
}
static Value *SimplifyX86immshift(const IntrinsicInst &II,
- InstCombiner::BuilderTy &Builder,
- bool LogicalShift, bool ShiftLeft) {
+ InstCombiner::BuilderTy &Builder) {
+ bool LogicalShift = false;
+ bool ShiftLeft = false;
+
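+ // Derive the shift kind from the intrinsic ID: arithmetic right (psra),
+ // logical right (psrl) or logical left (psll), with immediate and
+ // vector-count variants handled identically.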
+ switch (II.getIntrinsicID()) {
+ default:
+ return nullptr;
+ case Intrinsic::x86_sse2_psra_d:
+ case Intrinsic::x86_sse2_psra_w:
+ case Intrinsic::x86_sse2_psrai_d:
+ case Intrinsic::x86_sse2_psrai_w:
+ case Intrinsic::x86_avx2_psra_d:
+ case Intrinsic::x86_avx2_psra_w:
+ case Intrinsic::x86_avx2_psrai_d:
+ case Intrinsic::x86_avx2_psrai_w:
+ LogicalShift = false; ShiftLeft = false;
+ break;
+ case Intrinsic::x86_sse2_psrl_d:
+ case Intrinsic::x86_sse2_psrl_q:
+ case Intrinsic::x86_sse2_psrl_w:
+ case Intrinsic::x86_sse2_psrli_d:
+ case Intrinsic::x86_sse2_psrli_q:
+ case Intrinsic::x86_sse2_psrli_w:
+ case Intrinsic::x86_avx2_psrl_d:
+ case Intrinsic::x86_avx2_psrl_q:
+ case Intrinsic::x86_avx2_psrl_w:
+ case Intrinsic::x86_avx2_psrli_d:
+ case Intrinsic::x86_avx2_psrli_q:
+ case Intrinsic::x86_avx2_psrli_w:
+ LogicalShift = true; ShiftLeft = false;
+ break;
+ case Intrinsic::x86_sse2_psll_d:
+ case Intrinsic::x86_sse2_psll_q:
+ case Intrinsic::x86_sse2_psll_w:
+ case Intrinsic::x86_sse2_pslli_d:
+ case Intrinsic::x86_sse2_pslli_q:
+ case Intrinsic::x86_sse2_pslli_w:
+ case Intrinsic::x86_avx2_psll_d:
+ case Intrinsic::x86_avx2_psll_q:
+ case Intrinsic::x86_avx2_psll_w:
+ case Intrinsic::x86_avx2_pslli_d:
+ case Intrinsic::x86_avx2_pslli_q:
+ case Intrinsic::x86_avx2_pslli_w:
+ LogicalShift = true; ShiftLeft = true;
+ break;
+ }
assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");
// Simplify if count is constant.
@@ -788,51 +832,64 @@
}
// Constant fold ashr( <A x Bi>, Ci ).
- case Intrinsic::x86_sse2_psra_d:
- case Intrinsic::x86_sse2_psra_w:
+ // Constant fold lshr( <A x Bi>, Ci ).
+ // Constant fold shl( <A x Bi>, Ci ).
case Intrinsic::x86_sse2_psrai_d:
case Intrinsic::x86_sse2_psrai_w:
- case Intrinsic::x86_avx2_psra_d:
- case Intrinsic::x86_avx2_psra_w:
case Intrinsic::x86_avx2_psrai_d:
case Intrinsic::x86_avx2_psrai_w:
- if (Value *V = SimplifyX86immshift(*II, *Builder, false, false))
- return ReplaceInstUsesWith(*II, V);
- break;
-
- // Constant fold lshr( <A x Bi>, Ci ).
- case Intrinsic::x86_sse2_psrl_d:
- case Intrinsic::x86_sse2_psrl_q:
- case Intrinsic::x86_sse2_psrl_w:
case Intrinsic::x86_sse2_psrli_d:
case Intrinsic::x86_sse2_psrli_q:
case Intrinsic::x86_sse2_psrli_w:
- case Intrinsic::x86_avx2_psrl_d:
- case Intrinsic::x86_avx2_psrl_q:
- case Intrinsic::x86_avx2_psrl_w:
case Intrinsic::x86_avx2_psrli_d:
case Intrinsic::x86_avx2_psrli_q:
case Intrinsic::x86_avx2_psrli_w:
- if (Value *V = SimplifyX86immshift(*II, *Builder, true, false))
+ case Intrinsic::x86_sse2_pslli_d:
+ case Intrinsic::x86_sse2_pslli_q:
+ case Intrinsic::x86_sse2_pslli_w:
+ case Intrinsic::x86_avx2_pslli_d:
+ case Intrinsic::x86_avx2_pslli_q:
+ case Intrinsic::x86_avx2_pslli_w:
+ if (Value *V = SimplifyX86immshift(*II, *Builder))
return ReplaceInstUsesWith(*II, V);
break;
- // Constant fold shl( <A x Bi>, Ci ).
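+
+ // Shifts with a vector count. Beyond constant folding, the count
+ // operand can be shrunk to its demanded elements (see below).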
+ case Intrinsic::x86_sse2_psra_d:
+ case Intrinsic::x86_sse2_psra_w:
+ case Intrinsic::x86_avx2_psra_d:
+ case Intrinsic::x86_avx2_psra_w:
+ case Intrinsic::x86_sse2_psrl_d:
+ case Intrinsic::x86_sse2_psrl_q:
+ case Intrinsic::x86_sse2_psrl_w:
+ case Intrinsic::x86_avx2_psrl_d:
+ case Intrinsic::x86_avx2_psrl_q:
+ case Intrinsic::x86_avx2_psrl_w:
case Intrinsic::x86_sse2_psll_d:
case Intrinsic::x86_sse2_psll_q:
case Intrinsic::x86_sse2_psll_w:
- case Intrinsic::x86_sse2_pslli_d:
- case Intrinsic::x86_sse2_pslli_q:
- case Intrinsic::x86_sse2_pslli_w:
case Intrinsic::x86_avx2_psll_d:
case Intrinsic::x86_avx2_psll_q:
- case Intrinsic::x86_avx2_psll_w:
- case Intrinsic::x86_avx2_pslli_d:
- case Intrinsic::x86_avx2_pslli_q:
- case Intrinsic::x86_avx2_pslli_w:
- if (Value *V = SimplifyX86immshift(*II, *Builder, true, true))
+ case Intrinsic::x86_avx2_psll_w: {
+ if (Value *V = SimplifyX86immshift(*II, *Builder))
return ReplaceInstUsesWith(*II, V);
+
+ // SSE2/AVX2 uses only the first 64-bits of the 128-bit vector
+ // operand to compute the shift amount.
+ auto ShiftAmt = II->getArgOperand(1);
+ auto ShiftType = cast<VectorType>(ShiftAmt->getType());
+ assert(ShiftType->getPrimitiveSizeInBits() == 128 &&
+ "Unexpected packed shift size");
+ unsigned VWidth = ShiftType->getNumElements();
+
+ APInt DemandedElts = APInt::getLowBitsSet(VWidth, VWidth / 2);
+ APInt UndefElts(VWidth, 0);
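+ // If demanding only the lower half simplifies the shift amount (e.g. it
+ // strips a shuffle that only writes the upper elements), update the
+ // operand in place and revisit the call.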
+ if (Value *V =
+ SimplifyDemandedVectorElts(ShiftAmt, DemandedElts, UndefElts)) {
+ II->setArgOperand(1, V);
+ return II;
+ }
break;
+ }
case Intrinsic::x86_sse41_pmovsxbd:
case Intrinsic::x86_sse41_pmovsxbq:
Index: test/Transforms/InstCombine/x86-vector-shifts.ll
===================================================================
--- test/Transforms/InstCombine/x86-vector-shifts.ll
+++ test/Transforms/InstCombine/x86-vector-shifts.ll
@@ -826,6 +826,154 @@
}
;
+; Vector Demanded Bits
+;
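+; The shuffles below only modify the upper half of the shift-amount vector,
+; which the shifts ignore, so InstCombine should remove them.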
+
+define <8 x i16> @sse2_psra_w_var(<8 x i16> %v, <8 x i16> %a) nounwind readnone uwtable {
+; CHECK-LABEL: @sse2_psra_w_var
+; CHECK-NEXT: %1 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %v, <8 x i16> %a)
+; CHECK-NEXT: ret <8 x i16> %1
+ %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ %2 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %v, <8 x i16> %1)
+ ret <8 x i16> %2
+}
+
+define <4 x i32> @sse2_psra_d_var(<4 x i32> %v, <4 x i32> %a) nounwind readnone uwtable {
+; CHECK-LABEL: @sse2_psra_d_var
+; CHECK-NEXT: %1 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %v, <4 x i32> %a)
+; CHECK-NEXT: ret <4 x i32> %1
+ %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ %2 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %v, <4 x i32> %1)
+ ret <4 x i32> %2
+}
+
+define <16 x i16> @avx2_psra_w_var(<16 x i16> %v, <8 x i16> %a) nounwind readnone uwtable {
+; CHECK-LABEL: @avx2_psra_w_var
+; CHECK-NEXT: %1 = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %v, <8 x i16> %a)
+; CHECK-NEXT: ret <16 x i16> %1
+ %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ %2 = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %v, <8 x i16> %1)
+ ret <16 x i16> %2
+}
+
+define <8 x i32> @avx2_psra_d_var(<8 x i32> %v, <4 x i32> %a) nounwind readnone uwtable {
+; CHECK-LABEL: @avx2_psra_d_var
+; CHECK-NEXT: %1 = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %v, <4 x i32> %a)
+; CHECK-NEXT: ret <8 x i32> %1
+ %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ %2 = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %v, <4 x i32> %1)
+ ret <8 x i32> %2
+}
+
+define <8 x i16> @sse2_psrl_w_var(<8 x i16> %v, <8 x i16> %a) nounwind readnone uwtable {
+; CHECK-LABEL: @sse2_psrl_w_var
+; CHECK-NEXT: %1 = tail call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %v, <8 x i16> %a)
+; CHECK-NEXT: ret <8 x i16> %1
+ %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ %2 = tail call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %v, <8 x i16> %1)
+ ret <8 x i16> %2
+}
+
+define <4 x i32> @sse2_psrl_d_var(<4 x i32> %v, <4 x i32> %a) nounwind readnone uwtable {
+; CHECK-LABEL: @sse2_psrl_d_var
+; CHECK-NEXT: %1 = tail call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %v, <4 x i32> %a)
+; CHECK-NEXT: ret <4 x i32> %1
+ %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ %2 = tail call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %v, <4 x i32> %1)
+ ret <4 x i32> %2
+}
+
+define <2 x i64> @sse2_psrl_q_var(<2 x i64> %v, <2 x i64> %a) nounwind readnone uwtable {
+; CHECK-LABEL: @sse2_psrl_q_var
+; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %v, <2 x i64> %a)
+; CHECK-NEXT: ret <2 x i64> %1
+ %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
+ %2 = tail call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %v, <2 x i64> %1)
+ ret <2 x i64> %2
+}
+
+define <16 x i16> @avx2_psrl_w_var(<16 x i16> %v, <8 x i16> %a) nounwind readnone uwtable {
+; CHECK-LABEL: @avx2_psrl_w_var
+; CHECK-NEXT: %1 = tail call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %v, <8 x i16> %a)
+; CHECK-NEXT: ret <16 x i16> %1
+ %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ %2 = tail call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %v, <8 x i16> %1)
+ ret <16 x i16> %2
+}
+
+define <8 x i32> @avx2_psrl_d_var(<8 x i32> %v, <4 x i32> %a) nounwind readnone uwtable {
+; CHECK-LABEL: @avx2_psrl_d_var
+; CHECK-NEXT: %1 = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %v, <4 x i32> %a)
+; CHECK-NEXT: ret <8 x i32> %1
+ %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ %2 = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %v, <4 x i32> %1)
+ ret <8 x i32> %2
+}
+
+define <4 x i64> @avx2_psrl_q_var(<4 x i64> %v, <2 x i64> %a) nounwind readnone uwtable {
+; CHECK-LABEL: @avx2_psrl_q_var
+; CHECK-NEXT: %1 = tail call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %v, <2 x i64> %a)
+; CHECK-NEXT: ret <4 x i64> %1
+ %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
+ %2 = tail call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %v, <2 x i64> %1)
+ ret <4 x i64> %2
+}
+
+define <8 x i16> @sse2_psll_w_var(<8 x i16> %v, <8 x i16> %a) nounwind readnone uwtable {
+; CHECK-LABEL: @sse2_psll_w_var
+; CHECK-NEXT: %1 = tail call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %v, <8 x i16> %a)
+; CHECK-NEXT: ret <8 x i16> %1
+ %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ %2 = tail call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %v, <8 x i16> %1)
+ ret <8 x i16> %2
+}
+
+define <4 x i32> @sse2_psll_d_var(<4 x i32> %v, <4 x i32> %a) nounwind readnone uwtable {
+; CHECK-LABEL: @sse2_psll_d_var
+; CHECK-NEXT: %1 = tail call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %v, <4 x i32> %a)
+; CHECK-NEXT: ret <4 x i32> %1
+ %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ %2 = tail call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %v, <4 x i32> %1)
+ ret <4 x i32> %2
+}
+
+define <2 x i64> @sse2_psll_q_var(<2 x i64> %v, <2 x i64> %a) nounwind readnone uwtable {
+; CHECK-LABEL: @sse2_psll_q_var
+; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %v, <2 x i64> %a)
+; CHECK-NEXT: ret <2 x i64> %1
+ %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
+ %2 = tail call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %v, <2 x i64> %1)
+ ret <2 x i64> %2
+}
+
+define <16 x i16> @avx2_psll_w_var(<16 x i16> %v, <8 x i16> %a) nounwind readnone uwtable {
+; CHECK-LABEL: @avx2_psll_w_var
+; CHECK-NEXT: %1 = tail call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %v, <8 x i16> %a)
+; CHECK-NEXT: ret <16 x i16> %1
+ %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ %2 = tail call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %v, <8 x i16> %1)
+ ret <16 x i16> %2
+}
+
+define <8 x i32> @avx2_psll_d_var(<8 x i32> %v, <4 x i32> %a) nounwind readnone uwtable {
+; CHECK-LABEL: @avx2_psll_d_var
+; CHECK-NEXT: %1 = tail call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %v, <4 x i32> %a)
+; CHECK-NEXT: ret <8 x i32> %1
+ %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ %2 = tail call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %v, <4 x i32> %1)
+ ret <8 x i32> %2
+}
+
+define <4 x i64> @avx2_psll_q_var(<4 x i64> %v, <2 x i64> %a) nounwind readnone uwtable {
+; CHECK-LABEL: @avx2_psll_q_var
+; CHECK-NEXT: %1 = tail call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %v, <2 x i64> %a)
+; CHECK-NEXT: ret <4 x i64> %1
+ %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
+ %2 = tail call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %v, <2 x i64> %1)
+ ret <4 x i64> %2
+}
+
+;
; Constant Folding
;