Index: llvm/trunk/lib/IR/AutoUpgrade.cpp =================================================================== --- llvm/trunk/lib/IR/AutoUpgrade.cpp +++ llvm/trunk/lib/IR/AutoUpgrade.cpp @@ -132,6 +132,7 @@ Name.startswith("x86.avx2.vbroadcast") || Name.startswith("x86.avx2.pbroadcast") || Name.startswith("x86.avx.vpermil.") || + Name.startswith("x86.sse41.pmovsx") || Name == "x86.avx.vinsertf128.pd.256" || Name == "x86.avx.vinsertf128.ps.256" || Name == "x86.avx.vinsertf128.si.256" || @@ -440,6 +441,19 @@ for (unsigned I = 0; I < EltNum; ++I) Rep = Builder.CreateInsertElement(Rep, Load, ConstantInt::get(I32Ty, I)); + } else if (Name.startswith("llvm.x86.sse41.pmovsx")) { + VectorType *SrcTy = cast<VectorType>(CI->getArgOperand(0)->getType()); + VectorType *DstTy = cast<VectorType>(CI->getType()); + unsigned NumDstElts = DstTy->getNumElements(); + + // Extract a subvector of the first NumDstElts lanes and sign extend. + SmallVector<int, 8> ShuffleMask; + for (int i = 0; i != (int)NumDstElts; ++i) + ShuffleMask.push_back(i); + + Value *SV = Builder.CreateShuffleVector( + CI->getArgOperand(0), UndefValue::get(SrcTy), ShuffleMask); + Rep = Builder.CreateSExt(SV, DstTy); } else if (Name == "llvm.x86.avx2.vbroadcasti128") { // Replace vbroadcasts with a vector shuffle. Type *VT = VectorType::get(Type::getInt64Ty(C), 2); @@ -527,10 +541,10 @@ unsigned Imm = cast<ConstantInt>(CI->getArgOperand(2))->getZExtValue(); VectorType *VecTy = cast<VectorType>(CI->getType()); unsigned NumElts = VecTy->getNumElements(); - + // Mask off the high bits of the immediate value; hardware ignores those. Imm = Imm & 1; - + // Extend the second operand into a vector that is twice as big. Value *UndefV = UndefValue::get(Op1->getType()); SmallVector<Constant*, 8> Idxs; @@ -572,7 +586,7 @@ unsigned Imm = cast<ConstantInt>(CI->getArgOperand(1))->getZExtValue(); VectorType *VecTy = cast<VectorType>(CI->getType()); unsigned NumElts = VecTy->getNumElements(); - + // Mask off the high bits of the immediate value; hardware ignores those. 
Imm = Imm & 1; Index: llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h =================================================================== --- llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h +++ llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h @@ -681,13 +681,13 @@ X86ISD::EXPAND, 0), X86_INTRINSIC_DATA(avx512_mask_expand_q_512, COMPRESS_EXPAND_IN_REG, X86ISD::EXPAND, 0), - X86_INTRINSIC_DATA(avx512_mask_fpclass_pd_128, FPCLASS, X86ISD::VFPCLASS, 0), - X86_INTRINSIC_DATA(avx512_mask_fpclass_pd_256, FPCLASS, X86ISD::VFPCLASS, 0), - X86_INTRINSIC_DATA(avx512_mask_fpclass_pd_512, FPCLASS, X86ISD::VFPCLASS, 0), - X86_INTRINSIC_DATA(avx512_mask_fpclass_ps_128, FPCLASS, X86ISD::VFPCLASS, 0), - X86_INTRINSIC_DATA(avx512_mask_fpclass_ps_256, FPCLASS, X86ISD::VFPCLASS, 0), + X86_INTRINSIC_DATA(avx512_mask_fpclass_pd_128, FPCLASS, X86ISD::VFPCLASS, 0), + X86_INTRINSIC_DATA(avx512_mask_fpclass_pd_256, FPCLASS, X86ISD::VFPCLASS, 0), + X86_INTRINSIC_DATA(avx512_mask_fpclass_pd_512, FPCLASS, X86ISD::VFPCLASS, 0), + X86_INTRINSIC_DATA(avx512_mask_fpclass_ps_128, FPCLASS, X86ISD::VFPCLASS, 0), + X86_INTRINSIC_DATA(avx512_mask_fpclass_ps_256, FPCLASS, X86ISD::VFPCLASS, 0), X86_INTRINSIC_DATA(avx512_mask_fpclass_ps_512, FPCLASS, X86ISD::VFPCLASS, 0), - X86_INTRINSIC_DATA(avx512_mask_getexp_pd_128, INTR_TYPE_1OP_MASK_RM, + X86_INTRINSIC_DATA(avx512_mask_getexp_pd_128, INTR_TYPE_1OP_MASK_RM, X86ISD::FGETEXP_RND, 0), X86_INTRINSIC_DATA(avx512_mask_getexp_pd_256, INTR_TYPE_1OP_MASK_RM, X86ISD::FGETEXP_RND, 0), @@ -1628,12 +1628,6 @@ X86_INTRINSIC_DATA(sse41_pminsd, INTR_TYPE_2OP, ISD::SMIN, 0), X86_INTRINSIC_DATA(sse41_pminud, INTR_TYPE_2OP, ISD::UMIN, 0), X86_INTRINSIC_DATA(sse41_pminuw, INTR_TYPE_2OP, ISD::UMIN, 0), - X86_INTRINSIC_DATA(sse41_pmovsxbd, INTR_TYPE_1OP, X86ISD::VSEXT, 0), - X86_INTRINSIC_DATA(sse41_pmovsxbq, INTR_TYPE_1OP, X86ISD::VSEXT, 0), - X86_INTRINSIC_DATA(sse41_pmovsxbw, INTR_TYPE_1OP, X86ISD::VSEXT, 0), - X86_INTRINSIC_DATA(sse41_pmovsxdq, INTR_TYPE_1OP, X86ISD::VSEXT, 
0), - X86_INTRINSIC_DATA(sse41_pmovsxwd, INTR_TYPE_1OP, X86ISD::VSEXT, 0), - X86_INTRINSIC_DATA(sse41_pmovsxwq, INTR_TYPE_1OP, X86ISD::VSEXT, 0), X86_INTRINSIC_DATA(sse41_pmovzxbd, INTR_TYPE_1OP, X86ISD::VZEXT, 0), X86_INTRINSIC_DATA(sse41_pmovzxbq, INTR_TYPE_1OP, X86ISD::VZEXT, 0), X86_INTRINSIC_DATA(sse41_pmovzxbw, INTR_TYPE_1OP, X86ISD::VZEXT, 0), Index: llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp =================================================================== --- llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -936,12 +936,6 @@ break; } - case Intrinsic::x86_sse41_pmovsxbd: - case Intrinsic::x86_sse41_pmovsxbq: - case Intrinsic::x86_sse41_pmovsxbw: - case Intrinsic::x86_sse41_pmovsxdq: - case Intrinsic::x86_sse41_pmovsxwd: - case Intrinsic::x86_sse41_pmovsxwq: case Intrinsic::x86_avx2_pmovsxbd: case Intrinsic::x86_avx2_pmovsxbq: case Intrinsic::x86_avx2_pmovsxbw: Index: llvm/trunk/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll +++ llvm/trunk/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll @@ -143,3 +143,69 @@ ret <8 x i16> %res } declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i8) nounwind readnone + + +define <4 x i32> @test_x86_sse41_pmovsxbd(<16 x i8> %a0) { +; CHECK-LABEL: test_x86_sse41_pmovsxbd: +; CHECK: # BB#0: +; CHECK-NEXT: vpmovsxbd %xmm0, %xmm0 +; CHECK-NEXT: retl + %res = call <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8> %a0) ; <<4 x i32>> [#uses=1] + ret <4 x i32> %res +} +declare <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8>) nounwind readnone + + +define <2 x i64> @test_x86_sse41_pmovsxbq(<16 x i8> %a0) { +; CHECK-LABEL: test_x86_sse41_pmovsxbq: +; CHECK: # BB#0: +; CHECK-NEXT: vpmovsxbq %xmm0, %xmm0 +; CHECK-NEXT: retl + %res = call <2 x i64> @llvm.x86.sse41.pmovsxbq(<16 x i8> %a0) ; <<2 x i64>> [#uses=1] + 
ret <2 x i64> %res +} +declare <2 x i64> @llvm.x86.sse41.pmovsxbq(<16 x i8>) nounwind readnone + + +define <8 x i16> @test_x86_sse41_pmovsxbw(<16 x i8> %a0) { +; CHECK-LABEL: test_x86_sse41_pmovsxbw: +; CHECK: # BB#0: +; CHECK-NEXT: vpmovsxbw %xmm0, %xmm0 +; CHECK-NEXT: retl + %res = call <8 x i16> @llvm.x86.sse41.pmovsxbw(<16 x i8> %a0) ; <<8 x i16>> [#uses=1] + ret <8 x i16> %res +} +declare <8 x i16> @llvm.x86.sse41.pmovsxbw(<16 x i8>) nounwind readnone + + +define <2 x i64> @test_x86_sse41_pmovsxdq(<4 x i32> %a0) { +; CHECK-LABEL: test_x86_sse41_pmovsxdq: +; CHECK: # BB#0: +; CHECK-NEXT: vpmovsxdq %xmm0, %xmm0 +; CHECK-NEXT: retl + %res = call <2 x i64> @llvm.x86.sse41.pmovsxdq(<4 x i32> %a0) ; <<2 x i64>> [#uses=1] + ret <2 x i64> %res +} +declare <2 x i64> @llvm.x86.sse41.pmovsxdq(<4 x i32>) nounwind readnone + + +define <4 x i32> @test_x86_sse41_pmovsxwd(<8 x i16> %a0) { +; CHECK-LABEL: test_x86_sse41_pmovsxwd: +; CHECK: # BB#0: +; CHECK-NEXT: vpmovsxwd %xmm0, %xmm0 +; CHECK-NEXT: retl + %res = call <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16> %a0) ; <<4 x i32>> [#uses=1] + ret <4 x i32> %res +} +declare <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16>) nounwind readnone + + +define <2 x i64> @test_x86_sse41_pmovsxwq(<8 x i16> %a0) { +; CHECK-LABEL: test_x86_sse41_pmovsxwq: +; CHECK: # BB#0: +; CHECK-NEXT: vpmovsxwq %xmm0, %xmm0 +; CHECK-NEXT: retl + %res = call <2 x i64> @llvm.x86.sse41.pmovsxwq(<8 x i16> %a0) ; <<2 x i64>> [#uses=1] + ret <2 x i64> %res +} +declare <2 x i64> @llvm.x86.sse41.pmovsxwq(<8 x i16>) nounwind readnone Index: llvm/trunk/test/CodeGen/X86/avx-intrinsics-x86.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx-intrinsics-x86.ll +++ llvm/trunk/test/CodeGen/X86/avx-intrinsics-x86.ll @@ -1251,72 +1251,6 @@ declare <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16>, <8 x i16>) nounwind readnone -define <4 x i32> @test_x86_sse41_pmovsxbd(<16 x i8> %a0) { -; CHECK-LABEL: test_x86_sse41_pmovsxbd: -; 
CHECK: ## BB#0: -; CHECK-NEXT: vpmovsxbd %xmm0, %xmm0 -; CHECK-NEXT: retl - %res = call <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8> %a0) ; <<4 x i32>> [#uses=1] - ret <4 x i32> %res -} -declare <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8>) nounwind readnone - - -define <2 x i64> @test_x86_sse41_pmovsxbq(<16 x i8> %a0) { -; CHECK-LABEL: test_x86_sse41_pmovsxbq: -; CHECK: ## BB#0: -; CHECK-NEXT: vpmovsxbq %xmm0, %xmm0 -; CHECK-NEXT: retl - %res = call <2 x i64> @llvm.x86.sse41.pmovsxbq(<16 x i8> %a0) ; <<2 x i64>> [#uses=1] - ret <2 x i64> %res -} -declare <2 x i64> @llvm.x86.sse41.pmovsxbq(<16 x i8>) nounwind readnone - - -define <8 x i16> @test_x86_sse41_pmovsxbw(<16 x i8> %a0) { -; CHECK-LABEL: test_x86_sse41_pmovsxbw: -; CHECK: ## BB#0: -; CHECK-NEXT: vpmovsxbw %xmm0, %xmm0 -; CHECK-NEXT: retl - %res = call <8 x i16> @llvm.x86.sse41.pmovsxbw(<16 x i8> %a0) ; <<8 x i16>> [#uses=1] - ret <8 x i16> %res -} -declare <8 x i16> @llvm.x86.sse41.pmovsxbw(<16 x i8>) nounwind readnone - - -define <2 x i64> @test_x86_sse41_pmovsxdq(<4 x i32> %a0) { -; CHECK-LABEL: test_x86_sse41_pmovsxdq: -; CHECK: ## BB#0: -; CHECK-NEXT: vpmovsxdq %xmm0, %xmm0 -; CHECK-NEXT: retl - %res = call <2 x i64> @llvm.x86.sse41.pmovsxdq(<4 x i32> %a0) ; <<2 x i64>> [#uses=1] - ret <2 x i64> %res -} -declare <2 x i64> @llvm.x86.sse41.pmovsxdq(<4 x i32>) nounwind readnone - - -define <4 x i32> @test_x86_sse41_pmovsxwd(<8 x i16> %a0) { -; CHECK-LABEL: test_x86_sse41_pmovsxwd: -; CHECK: ## BB#0: -; CHECK-NEXT: vpmovsxwd %xmm0, %xmm0 -; CHECK-NEXT: retl - %res = call <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16> %a0) ; <<4 x i32>> [#uses=1] - ret <4 x i32> %res -} -declare <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16>) nounwind readnone - - -define <2 x i64> @test_x86_sse41_pmovsxwq(<8 x i16> %a0) { -; CHECK-LABEL: test_x86_sse41_pmovsxwq: -; CHECK: ## BB#0: -; CHECK-NEXT: vpmovsxwq %xmm0, %xmm0 -; CHECK-NEXT: retl - %res = call <2 x i64> @llvm.x86.sse41.pmovsxwq(<8 x i16> %a0) ; <<2 x i64>> [#uses=1] - 
ret <2 x i64> %res -} -declare <2 x i64> @llvm.x86.sse41.pmovsxwq(<8 x i16>) nounwind readnone - - define <4 x i32> @test_x86_sse41_pmovzxbd(<16 x i8> %a0) { ; CHECK-LABEL: test_x86_sse41_pmovzxbd: ; CHECK: ## BB#0: @@ -3378,7 +3312,7 @@ ; CHECK-LABEL: movnt_dq: ; CHECK: ## BB#0: ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: vpaddq LCPI282_0, %xmm0, %xmm0 +; CHECK-NEXT: vpaddq LCPI276_0, %xmm0, %xmm0 ; CHECK-NEXT: vmovntdq %ymm0, (%eax) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retl Index: llvm/trunk/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll +++ llvm/trunk/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll @@ -42,7 +42,6 @@ declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone - define <8 x i16> @test_x86_sse41_mpsadbw(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK: mpsadbw %res = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %a0, <16 x i8> %a1, i32 7) ; <<8 x i16>> [#uses=1] @@ -59,3 +58,49 @@ declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i32) nounwind readnone +define <4 x i32> @test_x86_sse41_pmovsxbd(<16 x i8> %a0) { + ; CHECK: pmovsxbd + %res = call <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8> %a0) ; <<4 x i32>> [#uses=1] + ret <4 x i32> %res +} +declare <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8>) nounwind readnone + + +define <2 x i64> @test_x86_sse41_pmovsxbq(<16 x i8> %a0) { + ; CHECK: pmovsxbq + %res = call <2 x i64> @llvm.x86.sse41.pmovsxbq(<16 x i8> %a0) ; <<2 x i64>> [#uses=1] + ret <2 x i64> %res +} +declare <2 x i64> @llvm.x86.sse41.pmovsxbq(<16 x i8>) nounwind readnone + + +define <8 x i16> @test_x86_sse41_pmovsxbw(<16 x i8> %a0) { + ; CHECK: pmovsxbw + %res = call <8 x i16> @llvm.x86.sse41.pmovsxbw(<16 x i8> %a0) ; <<8 x i16>> [#uses=1] + ret <8 x i16> %res +} +declare <8 x i16> @llvm.x86.sse41.pmovsxbw(<16 x i8>) nounwind readnone + + +define <2 x 
i64> @test_x86_sse41_pmovsxdq(<4 x i32> %a0) { + ; CHECK: pmovsxdq + %res = call <2 x i64> @llvm.x86.sse41.pmovsxdq(<4 x i32> %a0) ; <<2 x i64>> [#uses=1] + ret <2 x i64> %res +} +declare <2 x i64> @llvm.x86.sse41.pmovsxdq(<4 x i32>) nounwind readnone + + +define <4 x i32> @test_x86_sse41_pmovsxwd(<8 x i16> %a0) { + ; CHECK: pmovsxwd + %res = call <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16> %a0) ; <<4 x i32>> [#uses=1] + ret <4 x i32> %res +} +declare <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16>) nounwind readnone + + +define <2 x i64> @test_x86_sse41_pmovsxwq(<8 x i16> %a0) { + ; CHECK: pmovsxwq + %res = call <2 x i64> @llvm.x86.sse41.pmovsxwq(<8 x i16> %a0) ; <<2 x i64>> [#uses=1] + ret <2 x i64> %res +} +declare <2 x i64> @llvm.x86.sse41.pmovsxwq(<8 x i16>) nounwind readnone Index: llvm/trunk/test/CodeGen/X86/sse41-intrinsics-x86.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/sse41-intrinsics-x86.ll +++ llvm/trunk/test/CodeGen/X86/sse41-intrinsics-x86.ll @@ -162,54 +162,6 @@ declare <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16>, <8 x i16>) nounwind readnone -define <4 x i32> @test_x86_sse41_pmovsxbd(<16 x i8> %a0) { - ; CHECK: pmovsxbd - %res = call <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8> %a0) ; <<4 x i32>> [#uses=1] - ret <4 x i32> %res -} -declare <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8>) nounwind readnone - - -define <2 x i64> @test_x86_sse41_pmovsxbq(<16 x i8> %a0) { - ; CHECK: pmovsxbq - %res = call <2 x i64> @llvm.x86.sse41.pmovsxbq(<16 x i8> %a0) ; <<2 x i64>> [#uses=1] - ret <2 x i64> %res -} -declare <2 x i64> @llvm.x86.sse41.pmovsxbq(<16 x i8>) nounwind readnone - - -define <8 x i16> @test_x86_sse41_pmovsxbw(<16 x i8> %a0) { - ; CHECK: pmovsxbw - %res = call <8 x i16> @llvm.x86.sse41.pmovsxbw(<16 x i8> %a0) ; <<8 x i16>> [#uses=1] - ret <8 x i16> %res -} -declare <8 x i16> @llvm.x86.sse41.pmovsxbw(<16 x i8>) nounwind readnone - - -define <2 x i64> @test_x86_sse41_pmovsxdq(<4 x i32> %a0) { - 
; CHECK: pmovsxdq - %res = call <2 x i64> @llvm.x86.sse41.pmovsxdq(<4 x i32> %a0) ; <<2 x i64>> [#uses=1] - ret <2 x i64> %res -} -declare <2 x i64> @llvm.x86.sse41.pmovsxdq(<4 x i32>) nounwind readnone - - -define <4 x i32> @test_x86_sse41_pmovsxwd(<8 x i16> %a0) { - ; CHECK: pmovsxwd - %res = call <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16> %a0) ; <<4 x i32>> [#uses=1] - ret <4 x i32> %res -} -declare <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16>) nounwind readnone - - -define <2 x i64> @test_x86_sse41_pmovsxwq(<8 x i16> %a0) { - ; CHECK: pmovsxwq - %res = call <2 x i64> @llvm.x86.sse41.pmovsxwq(<8 x i16> %a0) ; <<2 x i64>> [#uses=1] - ret <2 x i64> %res -} -declare <2 x i64> @llvm.x86.sse41.pmovsxwq(<8 x i16>) nounwind readnone - - define <4 x i32> @test_x86_sse41_pmovzxbd(<16 x i8> %a0) { ; CHECK: pmovzxbd %res = call <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8> %a0) ; <<4 x i32>> [#uses=1] Index: llvm/trunk/test/CodeGen/X86/sse41-pmovxrm-intrinsics.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/sse41-pmovxrm-intrinsics.ll +++ llvm/trunk/test/CodeGen/X86/sse41-pmovxrm-intrinsics.ll @@ -6,8 +6,9 @@ ; SSE41: pmovsxbw (%rdi), %xmm0 ; AVX: vpmovsxbw (%rdi), %xmm0 %1 = load <16 x i8>, <16 x i8>* %a, align 1 - %2 = call <8 x i16> @llvm.x86.sse41.pmovsxbw(<16 x i8> %1) - ret <8 x i16> %2 + %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %3 = sext <8 x i8> %2 to <8 x i16> + ret <8 x i16> %3 } define <4 x i32> @test_llvm_x86_sse41_pmovsxbd(<16 x i8>* %a) { @@ -15,8 +16,9 @@ ; SSE41: pmovsxbd (%rdi), %xmm0 ; AVX: vpmovsxbd (%rdi), %xmm0 %1 = load <16 x i8>, <16 x i8>* %a, align 1 - %2 = call <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8> %1) - ret <4 x i32> %2 + %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %3 = sext <4 x i8> %2 to <4 x i32> + ret <4 x i32> %3 } define <2 x i64> @test_llvm_x86_sse41_pmovsxbq(<16 x i8>* %a) { @@ -24,8 +26,9 @@ ; SSE41: pmovsxbq (%rdi), %xmm0 ; AVX: 
vpmovsxbq (%rdi), %xmm0 %1 = load <16 x i8>, <16 x i8>* %a, align 1 - %2 = call <2 x i64> @llvm.x86.sse41.pmovsxbq(<16 x i8> %1) - ret <2 x i64> %2 + %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <2 x i32> <i32 0, i32 1> + %3 = sext <2 x i8> %2 to <2 x i64> + ret <2 x i64> %3 } define <4 x i32> @test_llvm_x86_sse41_pmovsxwd(<8 x i16>* %a) { @@ -33,8 +36,9 @@ ; SSE41: pmovsxwd (%rdi), %xmm0 ; AVX: vpmovsxwd (%rdi), %xmm0 %1 = load <8 x i16>, <8 x i16>* %a, align 1 - %2 = call <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16> %1) - ret <4 x i32> %2 + %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %3 = sext <4 x i16> %2 to <4 x i32> + ret <4 x i32> %3 } define <2 x i64> @test_llvm_x86_sse41_pmovsxwq(<8 x i16>* %a) { @@ -42,8 +46,9 @@ ; SSE41: pmovsxwq (%rdi), %xmm0 ; AVX: vpmovsxwq (%rdi), %xmm0 %1 = load <8 x i16>, <8 x i16>* %a, align 1 - %2 = call <2 x i64> @llvm.x86.sse41.pmovsxwq(<8 x i16> %1) - ret <2 x i64> %2 + %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <2 x i32> <i32 0, i32 1> + %3 = sext <2 x i16> %2 to <2 x i64> + ret <2 x i64> %3 } define <2 x i64> @test_llvm_x86_sse41_pmovsxdq(<4 x i32>* %a) { @@ -51,8 +56,9 @@ ; SSE41: pmovsxdq (%rdi), %xmm0 ; AVX: vpmovsxdq (%rdi), %xmm0 %1 = load <4 x i32>, <4 x i32>* %a, align 1 - %2 = call <2 x i64> @llvm.x86.sse41.pmovsxdq(<4 x i32> %1) - ret <2 x i64> %2 + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <2 x i32> <i32 0, i32 1> + %3 = sext <2 x i32> %2 to <2 x i64> + ret <2 x i64> %3 } define <8 x i16> @test_llvm_x86_sse41_pmovzxbw(<16 x i8>* %a) { @@ -115,9 +121,3 @@ declare <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8>) declare <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8>) declare <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8>) -declare <2 x i64> @llvm.x86.sse41.pmovsxdq(<4 x i32>) -declare <2 x i64> @llvm.x86.sse41.pmovsxwq(<8 x i16>) -declare <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16>) -declare <2 x i64> @llvm.x86.sse41.pmovsxbq(<16 x i8>) -declare <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8>) -declare <8 x i16> 
@llvm.x86.sse41.pmovsxbw(<16 x i8>) Index: llvm/trunk/test/CodeGen/X86/sse41.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/sse41.ll +++ llvm/trunk/test/CodeGen/X86/sse41.ll @@ -31,49 +31,6 @@ ret <16 x i8> %tmp1 } -define <2 x i64> @pmovsxbd_1(i32* %p) nounwind { -; X32-LABEL: pmovsxbd_1: -; X32: ## BB#0: ## %entry -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: pmovsxbd (%eax), %xmm0 -; X32-NEXT: retl -; -; X64-LABEL: pmovsxbd_1: -; X64: ## BB#0: ## %entry -; X64-NEXT: pmovsxbd (%rdi), %xmm0 -; X64-NEXT: retq -entry: - %0 = load i32, i32* %p, align 4 - %1 = insertelement <4 x i32> undef, i32 %0, i32 0 - %2 = insertelement <4 x i32> %1, i32 0, i32 1 - %3 = insertelement <4 x i32> %2, i32 0, i32 2 - %4 = insertelement <4 x i32> %3, i32 0, i32 3 - %5 = bitcast <4 x i32> %4 to <16 x i8> - %6 = tail call <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8> %5) nounwind readnone - %7 = bitcast <4 x i32> %6 to <2 x i64> - ret <2 x i64> %7 -} - -define <2 x i64> @pmovsxwd_1(i64* %p) nounwind readonly { -; X32-LABEL: pmovsxwd_1: -; X32: ## BB#0: ## %entry -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: pmovsxwd (%eax), %xmm0 -; X32-NEXT: retl -; -; X64-LABEL: pmovsxwd_1: -; X64: ## BB#0: ## %entry -; X64-NEXT: pmovsxwd (%rdi), %xmm0 -; X64-NEXT: retq -entry: - %0 = load i64, i64* %p ; [#uses=1] - %tmp2 = insertelement <2 x i64> zeroinitializer, i64 %0, i32 0 ; <<2 x i64>> [#uses=1] - %1 = bitcast <2 x i64> %tmp2 to <8 x i16> ; <<8 x i16>> [#uses=1] - %2 = tail call <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16> %1) nounwind readnone ; <<4 x i32>> [#uses=1] - %3 = bitcast <4 x i32> %2 to <2 x i64> ; <<2 x i64>> [#uses=1] - ret <2 x i64> %3 -} - define <2 x i64> @pmovzxbq_1() nounwind { ; X32-LABEL: pmovzxbq_1: ; X32: ## BB#0: ## %entry @@ -94,8 +51,6 @@ ret <2 x i64> %3 } -declare <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8>) nounwind readnone -declare <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16>) nounwind readnone 
declare <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8>) nounwind readnone define i32 @extractps_1(<4 x float> %v) nounwind { @@ -137,7 +92,7 @@ ; X32: ## BB#0: ; X32-NEXT: pushl %eax ; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] -; X32-NEXT: addss LCPI7_0, %xmm0 +; X32-NEXT: addss LCPI5_0, %xmm0 ; X32-NEXT: movss %xmm0, (%esp) ; X32-NEXT: flds (%esp) ; X32-NEXT: popl %eax