Index: llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp
===================================================================
--- llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -198,6 +198,24 @@
   return nullptr;
 }
 
+static Value *SimplifyX86extend(const IntrinsicInst &II,
+                                InstCombiner::BuilderTy &Builder,
+                                bool SignExtend) {
+  VectorType *SrcTy = cast<VectorType>(II.getArgOperand(0)->getType());
+  VectorType *DstTy = cast<VectorType>(II.getType());
+  unsigned NumDstElts = DstTy->getNumElements();
+
+  // Extract a subvector of the first NumDstElts lanes and sign/zero extend.
+  SmallVector<int, 8> ShuffleMask;
+  for (int i = 0; i != NumDstElts; ++i)
+    ShuffleMask.push_back(i);
+
+  Value *SV = Builder.CreateShuffleVector(II.getArgOperand(0),
+                                          UndefValue::get(SrcTy), ShuffleMask);
+  return SignExtend ? Builder.CreateSExt(SV, DstTy)
+                    : Builder.CreateZExt(SV, DstTy);
+}
+
 static Value *SimplifyX86insertps(const IntrinsicInst &II,
                                   InstCombiner::BuilderTy &Builder) {
   if (auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
@@ -778,25 +796,38 @@
                                Builder->CreateVectorSplat(VWidth, VTCI));
   }
 
+  case Intrinsic::x86_sse41_pmovsxbd:
+  case Intrinsic::x86_sse41_pmovsxbq:
   case Intrinsic::x86_sse41_pmovsxbw:
-  case Intrinsic::x86_sse41_pmovsxwd:
   case Intrinsic::x86_sse41_pmovsxdq:
+  case Intrinsic::x86_sse41_pmovsxwd:
+  case Intrinsic::x86_sse41_pmovsxwq:
+  case Intrinsic::x86_avx2_pmovsxbd:
+  case Intrinsic::x86_avx2_pmovsxbq:
+  case Intrinsic::x86_avx2_pmovsxbw:
+  case Intrinsic::x86_avx2_pmovsxdq:
+  case Intrinsic::x86_avx2_pmovsxwd:
+  case Intrinsic::x86_avx2_pmovsxwq:
+    if (Value *V = SimplifyX86extend(*II, *Builder, true))
+      return ReplaceInstUsesWith(*II, V);
+    break;
+
+  case Intrinsic::x86_sse41_pmovzxbd:
+  case Intrinsic::x86_sse41_pmovzxbq:
   case Intrinsic::x86_sse41_pmovzxbw:
+  case Intrinsic::x86_sse41_pmovzxdq:
   case Intrinsic::x86_sse41_pmovzxwd:
-  case Intrinsic::x86_sse41_pmovzxdq: {
-    // pmov{s|z}x ignores the upper half of their input vectors.
-    unsigned VWidth =
-        cast<VectorType>(II->getArgOperand(0)->getType())->getNumElements();
-    unsigned LowHalfElts = VWidth / 2;
-    APInt InputDemandedElts(APInt::getBitsSet(VWidth, 0, LowHalfElts));
-    APInt UndefElts(VWidth, 0);
-    if (Value *TmpV = SimplifyDemandedVectorElts(
-            II->getArgOperand(0), InputDemandedElts, UndefElts)) {
-      II->setArgOperand(0, TmpV);
-      return II;
-    }
+  case Intrinsic::x86_sse41_pmovzxwq:
+  case Intrinsic::x86_avx2_pmovzxbd:
+  case Intrinsic::x86_avx2_pmovzxbq:
+  case Intrinsic::x86_avx2_pmovzxbw:
+  case Intrinsic::x86_avx2_pmovzxdq:
+  case Intrinsic::x86_avx2_pmovzxwd:
+  case Intrinsic::x86_avx2_pmovzxwq:
+    if (Value *V = SimplifyX86extend(*II, *Builder, false))
+      return ReplaceInstUsesWith(*II, V);
     break;
-  }
+
   case Intrinsic::x86_sse41_insertps:
     if (Value *V = SimplifyX86insertps(*II, *Builder))
       return ReplaceInstUsesWith(*II, V);
Index: llvm/trunk/test/Transforms/InstCombine/vec_demanded_elts.ll
===================================================================
--- llvm/trunk/test/Transforms/InstCombine/vec_demanded_elts.ll
+++ llvm/trunk/test/Transforms/InstCombine/vec_demanded_elts.ll
@@ -138,22 +138,6 @@
 declare i32 @llvm.x86.sse2.cvttsd2si(<2 x double>)
 declare i64 @llvm.x86.sse2.cvttsd2si64(<2 x double>)
 
-;
-define <4 x i32> @kernel3_vertical(<4 x i16> * %src, <8 x i16> * %foo) nounwind {
-entry:
-  %tmp = load <4 x i16>, <4 x i16>* %src
-  %tmp1 = load <8 x i16>, <8 x i16>* %foo
-; CHECK: %tmp2 = shufflevector
-  %tmp2 = shufflevector <4 x i16> %tmp, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; pmovzxwd ignores the upper 64-bits of its input; -instcombine should remove this shuffle:
-; CHECK-NOT: shufflevector
-  %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: pmovzxwd
-  %0 = call <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16> %tmp3)
-  ret <4 x i32> %0
-}
-declare <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16>) nounwind readnone
-
 define <4 x float> @dead_shuffle_elt(<4 x float> %x, <2 x float> %y) nounwind {
 entry:
 ; CHECK-LABEL: define <4 x float> @dead_shuffle_elt(
Index: llvm/trunk/test/Transforms/InstCombine/x86-pmovsx.ll
===================================================================
--- llvm/trunk/test/Transforms/InstCombine/x86-pmovsx.ll
+++ llvm/trunk/test/Transforms/InstCombine/x86-pmovsx.ll
@@ -0,0 +1,136 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+declare <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8>) nounwind readnone
+declare <2 x i64> @llvm.x86.sse41.pmovsxbq(<16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.x86.sse41.pmovsxbw(<16 x i8>) nounwind readnone
+declare <2 x i64> @llvm.x86.sse41.pmovsxdq(<4 x i32>) nounwind readnone
+declare <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16>) nounwind readnone
+declare <2 x i64> @llvm.x86.sse41.pmovsxwq(<8 x i16>) nounwind readnone
+
+declare <8 x i32> @llvm.x86.avx2.pmovsxbd(<16 x i8>) nounwind readnone
+declare <4 x i64> @llvm.x86.avx2.pmovsxbq(<16 x i8>) nounwind readnone
+declare <16 x i16> @llvm.x86.avx2.pmovsxbw(<16 x i8>) nounwind readnone
+declare <4 x i64> @llvm.x86.avx2.pmovsxdq(<4 x i32>) nounwind readnone
+declare <8 x i32> @llvm.x86.avx2.pmovsxwd(<8 x i16>) nounwind readnone
+declare <4 x i64> @llvm.x86.avx2.pmovsxwq(<8 x i16>) nounwind readnone
+
+;
+; Basic sign extension tests
+;
+
+define <4 x i32> @sse41_pmovsxbd(<16 x i8> %v) nounwind readnone {
+; CHECK-LABEL: @sse41_pmovsxbd
+; CHECK-NEXT: shufflevector <16 x i8> %v, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: sext <4 x i8> %1 to <4 x i32>
+; CHECK-NEXT: ret <4 x i32> %2
+
+  %res = call <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8> %v)
+  ret <4 x i32> %res
+}
+
+define <2 x i64> @sse41_pmovsxbq(<16 x i8> %v) nounwind readnone {
+; CHECK-LABEL: @sse41_pmovsxbq
+; CHECK-NEXT: shufflevector <16 x i8> %v, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: sext <2 x i8> %1 to <2 x i64>
+; CHECK-NEXT: ret <2 x i64> %2
+
+  %res = call <2 x i64> @llvm.x86.sse41.pmovsxbq(<16 x i8> %v)
+  ret <2 x i64> %res
+}
+
+define <8 x i16> @sse41_pmovsxbw(<16 x i8> %v) nounwind readnone {
+; CHECK-LABEL: @sse41_pmovsxbw
+; CHECK-NEXT: shufflevector <16 x i8> %v, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: sext <8 x i8> %1 to <8 x i16>
+; CHECK-NEXT: ret <8 x i16> %2
+
+  %res = call <8 x i16> @llvm.x86.sse41.pmovsxbw(<16 x i8> %v)
+  ret <8 x i16> %res
+}
+
+define <2 x i64> @sse41_pmovsxdq(<4 x i32> %v) nounwind readnone {
+; CHECK-LABEL: @sse41_pmovsxdq
+; CHECK-NEXT: shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: sext <2 x i32> %1 to <2 x i64>
+; CHECK-NEXT: ret <2 x i64> %2
+
+  %res = call <2 x i64> @llvm.x86.sse41.pmovsxdq(<4 x i32> %v)
+  ret <2 x i64> %res
+}
+
+define <4 x i32> @sse41_pmovsxwd(<8 x i16> %v) nounwind readnone {
+; CHECK-LABEL: @sse41_pmovsxwd
+; CHECK-NEXT: shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: sext <4 x i16> %1 to <4 x i32>
+; CHECK-NEXT: ret <4 x i32> %2
+
+  %res = call <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16> %v)
+  ret <4 x i32> %res
+}
+
+define <2 x i64> @sse41_pmovsxwq(<8 x i16> %v) nounwind readnone {
+; CHECK-LABEL: @sse41_pmovsxwq
+; CHECK-NEXT: shufflevector <8 x i16> %v, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: sext <2 x i16> %1 to <2 x i64>
+; CHECK-NEXT: ret <2 x i64> %2
+
+  %res = call <2 x i64> @llvm.x86.sse41.pmovsxwq(<8 x i16> %v)
+  ret <2 x i64> %res
+}
+
+define <8 x i32> @avx2_pmovsxbd(<16 x i8> %v) nounwind readnone {
+; CHECK-LABEL: @avx2_pmovsxbd
+; CHECK-NEXT: shufflevector <16 x i8> %v, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: sext <8 x i8> %1 to <8 x i32>
+; CHECK-NEXT: ret <8 x i32> %2
+
+  %res = call <8 x i32> @llvm.x86.avx2.pmovsxbd(<16 x i8> %v)
+  ret <8 x i32> %res
+}
+
+define <4 x i64> @avx2_pmovsxbq(<16 x i8> %v) nounwind readnone {
+; CHECK-LABEL: @avx2_pmovsxbq
+; CHECK-NEXT: shufflevector <16 x i8> %v, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: sext <4 x i8> %1 to <4 x i64>
+; CHECK-NEXT: ret <4 x i64> %2
+
+  %res = call <4 x i64> @llvm.x86.avx2.pmovsxbq(<16 x i8> %v)
+  ret <4 x i64> %res
+}
+
+define <16 x i16> @avx2_pmovsxbw(<16 x i8> %v) nounwind readnone {
+; CHECK-LABEL: @avx2_pmovsxbw
+; CHECK-NEXT: sext <16 x i8> %v to <16 x i16>
+; CHECK-NEXT: ret <16 x i16> %1
+
+  %res = call <16 x i16> @llvm.x86.avx2.pmovsxbw(<16 x i8> %v)
+  ret <16 x i16> %res
+}
+
+define <4 x i64> @avx2_pmovsxdq(<4 x i32> %v) nounwind readnone {
+; CHECK-LABEL: @avx2_pmovsxdq
+; CHECK-NEXT: sext <4 x i32> %v to <4 x i64>
+; CHECK-NEXT: ret <4 x i64> %1
+
+  %res = call <4 x i64> @llvm.x86.avx2.pmovsxdq(<4 x i32> %v)
+  ret <4 x i64> %res
+}
+
+define <8 x i32> @avx2_pmovsxwd(<8 x i16> %v) nounwind readnone {
+; CHECK-LABEL: @avx2_pmovsxwd
+; CHECK-NEXT: sext <8 x i16> %v to <8 x i32>
+; CHECK-NEXT: ret <8 x i32> %1
+
+  %res = call <8 x i32> @llvm.x86.avx2.pmovsxwd(<8 x i16> %v)
+  ret <8 x i32> %res
+}
+
+define <4 x i64> @avx2_pmovsxwq(<8 x i16> %v) nounwind readnone {
+; CHECK-LABEL: @avx2_pmovsxwq
+; CHECK-NEXT: shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: sext <4 x i16> %1 to <4 x i64>
+; CHECK-NEXT: ret <4 x i64> %2
+
+  %res = call <4 x i64> @llvm.x86.avx2.pmovsxwq(<8 x i16> %v)
+  ret <4 x i64> %res
+}
Index: llvm/trunk/test/Transforms/InstCombine/x86-pmovzx.ll
===================================================================
--- llvm/trunk/test/Transforms/InstCombine/x86-pmovzx.ll
+++ llvm/trunk/test/Transforms/InstCombine/x86-pmovzx.ll
@@ -0,0 +1,136 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+declare <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8>) nounwind readnone
+declare <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8>) nounwind readnone
+declare <2 x i64> @llvm.x86.sse41.pmovzxdq(<4 x i32>) nounwind readnone
+declare <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16>) nounwind readnone
+declare <2 x i64> @llvm.x86.sse41.pmovzxwq(<8 x i16>) nounwind readnone
+
+declare <8 x i32> @llvm.x86.avx2.pmovzxbd(<16 x i8>) nounwind readnone
+declare <4 x i64> @llvm.x86.avx2.pmovzxbq(<16 x i8>) nounwind readnone
+declare <16 x i16> @llvm.x86.avx2.pmovzxbw(<16 x i8>) nounwind readnone
+declare <4 x i64> @llvm.x86.avx2.pmovzxdq(<4 x i32>) nounwind readnone
+declare <8 x i32> @llvm.x86.avx2.pmovzxwd(<8 x i16>) nounwind readnone
+declare <4 x i64> @llvm.x86.avx2.pmovzxwq(<8 x i16>) nounwind readnone
+
+;
+; Basic zero extension tests
+;
+
+define <4 x i32> @sse41_pmovzxbd(<16 x i8> %v) nounwind readnone {
+; CHECK-LABEL: @sse41_pmovzxbd
+; CHECK-NEXT: shufflevector <16 x i8> %v, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: zext <4 x i8> %1 to <4 x i32>
+; CHECK-NEXT: ret <4 x i32> %2
+
+  %res = call <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8> %v)
+  ret <4 x i32> %res
+}
+
+define <2 x i64> @sse41_pmovzxbq(<16 x i8> %v) nounwind readnone {
+; CHECK-LABEL: @sse41_pmovzxbq
+; CHECK-NEXT: shufflevector <16 x i8> %v, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: zext <2 x i8> %1 to <2 x i64>
+; CHECK-NEXT: ret <2 x i64> %2
+
+  %res = call <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8> %v)
+  ret <2 x i64> %res
+}
+
+define <8 x i16> @sse41_pmovzxbw(<16 x i8> %v) nounwind readnone {
+; CHECK-LABEL: @sse41_pmovzxbw
+; CHECK-NEXT: shufflevector <16 x i8> %v, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: zext <8 x i8> %1 to <8 x i16>
+; CHECK-NEXT: ret <8 x i16> %2
+
+  %res = call <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8> %v)
+  ret <8 x i16> %res
+}
+
+define <2 x i64> @sse41_pmovzxdq(<4 x i32> %v) nounwind readnone {
+; CHECK-LABEL: @sse41_pmovzxdq
+; CHECK-NEXT: shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: zext <2 x i32> %1 to <2 x i64>
+; CHECK-NEXT: ret <2 x i64> %2
+
+  %res = call <2 x i64> @llvm.x86.sse41.pmovzxdq(<4 x i32> %v)
+  ret <2 x i64> %res
+}
+
+define <4 x i32> @sse41_pmovzxwd(<8 x i16> %v) nounwind readnone {
+; CHECK-LABEL: @sse41_pmovzxwd
+; CHECK-NEXT: shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: zext <4 x i16> %1 to <4 x i32>
+; CHECK-NEXT: ret <4 x i32> %2
+
+  %res = call <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16> %v)
+  ret <4 x i32> %res
+}
+
+define <2 x i64> @sse41_pmovzxwq(<8 x i16> %v) nounwind readnone {
+; CHECK-LABEL: @sse41_pmovzxwq
+; CHECK-NEXT: shufflevector <8 x i16> %v, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: zext <2 x i16> %1 to <2 x i64>
+; CHECK-NEXT: ret <2 x i64> %2
+
+  %res = call <2 x i64> @llvm.x86.sse41.pmovzxwq(<8 x i16> %v)
+  ret <2 x i64> %res
+}
+
+define <8 x i32> @avx2_pmovzxbd(<16 x i8> %v) nounwind readnone {
+; CHECK-LABEL: @avx2_pmovzxbd
+; CHECK-NEXT: shufflevector <16 x i8> %v, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: zext <8 x i8> %1 to <8 x i32>
+; CHECK-NEXT: ret <8 x i32> %2
+
+  %res = call <8 x i32> @llvm.x86.avx2.pmovzxbd(<16 x i8> %v)
+  ret <8 x i32> %res
+}
+
+define <4 x i64> @avx2_pmovzxbq(<16 x i8> %v) nounwind readnone {
+; CHECK-LABEL: @avx2_pmovzxbq
+; CHECK-NEXT: shufflevector <16 x i8> %v, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: zext <4 x i8> %1 to <4 x i64>
+; CHECK-NEXT: ret <4 x i64> %2
+
+  %res = call <4 x i64> @llvm.x86.avx2.pmovzxbq(<16 x i8> %v)
+  ret <4 x i64> %res
+}
+
+define <16 x i16> @avx2_pmovzxbw(<16 x i8> %v) nounwind readnone {
+; CHECK-LABEL: @avx2_pmovzxbw
+; CHECK-NEXT: zext <16 x i8> %v to <16 x i16>
+; CHECK-NEXT: ret <16 x i16> %1
+
+  %res = call <16 x i16> @llvm.x86.avx2.pmovzxbw(<16 x i8> %v)
+  ret <16 x i16> %res
+}
+
+define <4 x i64> @avx2_pmovzxdq(<4 x i32> %v) nounwind readnone {
+; CHECK-LABEL: @avx2_pmovzxdq
+; CHECK-NEXT: zext <4 x i32> %v to <4 x i64>
+; CHECK-NEXT: ret <4 x i64> %1
+
+  %res = call <4 x i64> @llvm.x86.avx2.pmovzxdq(<4 x i32> %v)
+  ret <4 x i64> %res
+}
+
+define <8 x i32> @avx2_pmovzxwd(<8 x i16> %v) nounwind readnone {
+; CHECK-LABEL: @avx2_pmovzxwd
+; CHECK-NEXT: zext <8 x i16> %v to <8 x i32>
+; CHECK-NEXT: ret <8 x i32> %1
+
+  %res = call <8 x i32> @llvm.x86.avx2.pmovzxwd(<8 x i16> %v)
+  ret <8 x i32> %res
+}
+
+define <4 x i64> @avx2_pmovzxwq(<8 x i16> %v) nounwind readnone {
+; CHECK-LABEL: @avx2_pmovzxwq
+; CHECK-NEXT: shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: zext <4 x i16> %1 to <4 x i64>
+; CHECK-NEXT: ret <4 x i64> %2
+
+  %res = call <4 x i64> @llvm.x86.avx2.pmovzxwq(<8 x i16> %v)
+  ret <4 x i64> %res
+}
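
For reference, a minimal before/after sketch of the rewrite the new SimplifyX86extend helper performs, shown for the SSE4.1 pmovsxwd case; it mirrors what the CHECK lines in x86-pmovsx.ll above verify, and the %1/%2 value names are only illustrative:

  ; Before -instcombine:
  %res = call <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16> %v)

  ; After -instcombine: extract the low NumDstElts lanes, then extend them.
  %1 = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = sext <4 x i16> %1 to <4 x i32>
  ret <4 x i32> %2

When the source and destination element counts already match (the avx2 pmovsxbw/dq/wd and pmovzxbw/dq/wd cases), the extract shuffle is an identity and folds away, leaving only the plain sext/zext, which is what the corresponding avx2_* tests check. The new tests are driven by the RUN line at the top of each file, i.e. opt < %s -instcombine -S | FileCheck %s.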