Index: llvm/trunk/include/llvm/IR/IntrinsicsX86.td
===================================================================
--- llvm/trunk/include/llvm/IR/IntrinsicsX86.td
+++ llvm/trunk/include/llvm/IR/IntrinsicsX86.td
@@ -479,6 +479,8 @@
               Intrinsic<[llvm_v4f32_ty], [llvm_v2f64_ty], [IntrNoMem]>;
   def int_x86_sse2_cvtps2dq : GCCBuiltin<"__builtin_ia32_cvtps2dq">,
               Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
+  def int_x86_sse2_cvttps2dq : GCCBuiltin<"__builtin_ia32_cvttps2dq">,
+              Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
   def int_x86_sse2_cvtsd2si : GCCBuiltin<"__builtin_ia32_cvtsd2si">,
               Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty], [IntrNoMem]>;
   def int_x86_sse2_cvtsd2si64 : GCCBuiltin<"__builtin_ia32_cvtsd2si64">,
@@ -1512,8 +1514,12 @@
            Intrinsic<[llvm_v4f32_ty], [llvm_v4f64_ty], [IntrNoMem]>;
   def int_x86_avx_cvt_ps2dq_256 : GCCBuiltin<"__builtin_ia32_cvtps2dq256">,
            Intrinsic<[llvm_v8i32_ty], [llvm_v8f32_ty], [IntrNoMem]>;
+  def int_x86_avx_cvtt_pd2dq_256 : GCCBuiltin<"__builtin_ia32_cvttpd2dq256">,
+           Intrinsic<[llvm_v4i32_ty], [llvm_v4f64_ty], [IntrNoMem]>;
   def int_x86_avx_cvt_pd2dq_256 : GCCBuiltin<"__builtin_ia32_cvtpd2dq256">,
            Intrinsic<[llvm_v4i32_ty], [llvm_v4f64_ty], [IntrNoMem]>;
+  def int_x86_avx_cvtt_ps2dq_256 : GCCBuiltin<"__builtin_ia32_cvttps2dq256">,
+           Intrinsic<[llvm_v8i32_ty], [llvm_v8f32_ty], [IntrNoMem]>;
 }
 
 // Vector bit test
Index: llvm/trunk/lib/Analysis/ConstantFolding.cpp
===================================================================
--- llvm/trunk/lib/Analysis/ConstantFolding.cpp
+++ llvm/trunk/lib/Analysis/ConstantFolding.cpp
@@ -1424,8 +1424,8 @@
 /// integer type Ty is used to select how many bits are available for the
 /// result. Returns null if the conversion cannot be performed, otherwise
 /// returns the Constant value resulting from the conversion.
-Constant *ConstantFoldConvertToInt(const APFloat &Val, bool roundTowardZero,
-                                   Type *Ty) {
+Constant *ConstantFoldSSEConvertToInt(const APFloat &Val, bool roundTowardZero,
+                                      Type *Ty) {
   // All of these conversion intrinsics form an integer of at most 64bits.
   unsigned ResultWidth = Ty->getIntegerBitWidth();
   assert(ResultWidth <= 64 &&
@@ -1438,7 +1438,8 @@
   APFloat::opStatus status = Val.convertToInteger(&UIntVal, ResultWidth,
                                                   /*isSigned=*/true, mode,
                                                   &isExact);
-  if (status != APFloat::opOK && status != APFloat::opInexact)
+  if (status != APFloat::opOK &&
+      (!roundTowardZero || status != APFloat::opInexact))
     return nullptr;
   return ConstantInt::get(Ty, UIntVal, /*isSigned=*/true);
 }
@@ -1676,17 +1677,17 @@
     case Intrinsic::x86_sse2_cvtsd2si:
     case Intrinsic::x86_sse2_cvtsd2si64:
       if (ConstantFP *FPOp =
-              dyn_cast_or_null<ConstantFP>(Op->getAggregateElement(0U)))
-        return ConstantFoldConvertToInt(FPOp->getValueAPF(),
-                                        /*roundTowardZero=*/false, Ty);
+              dyn_cast_or_null<ConstantFP>(Op->getAggregateElement(0U)))
+        return ConstantFoldSSEConvertToInt(FPOp->getValueAPF(),
+                                           /*roundTowardZero=*/false, Ty);
     case Intrinsic::x86_sse_cvttss2si:
     case Intrinsic::x86_sse_cvttss2si64:
     case Intrinsic::x86_sse2_cvttsd2si:
    case Intrinsic::x86_sse2_cvttsd2si64:
       if (ConstantFP *FPOp =
-              dyn_cast_or_null<ConstantFP>(Op->getAggregateElement(0U)))
-        return ConstantFoldConvertToInt(FPOp->getValueAPF(),
-                                        /*roundTowardZero=*/true, Ty);
+              dyn_cast_or_null<ConstantFP>(Op->getAggregateElement(0U)))
+        return ConstantFoldSSEConvertToInt(FPOp->getValueAPF(),
+                                           /*roundTowardZero=*/true, Ty);
     }
   }
Index: llvm/trunk/lib/IR/AutoUpgrade.cpp
===================================================================
--- llvm/trunk/lib/IR/AutoUpgrade.cpp
+++ llvm/trunk/lib/IR/AutoUpgrade.cpp
@@ -251,8 +251,6 @@
          Name == "sse2.cvtps2pd" ||
          Name == "avx.cvtdq2.pd.256" ||
          Name == "avx.cvt.ps2.pd.256" ||
-         Name == "sse2.cvttps2dq" ||
-         Name.startswith("avx.cvtt.") ||
          Name.startswith("avx.vinsertf128.") ||
          Name == "avx2.vinserti128" ||
          Name.startswith("avx.vextractf128.") ||
@@ -712,12 +710,6 @@
       Rep = Builder.CreateSIToFP(Rep, DstTy, "cvtdq2pd");
     else
       Rep = Builder.CreateFPExt(Rep, DstTy, "cvtps2pd");
-  } else if (IsX86 && (Name == "sse2.cvttps2dq" ||
-                       Name.startswith("avx.cvtt."))) {
-    // Truncation (round to zero) float/double to i32 vector conversion.
-    Value *Src = CI->getArgOperand(0);
-    VectorType *DstTy = cast<VectorType>(CI->getType());
-    Rep = Builder.CreateFPToSI(Src, DstTy, "cvtt");
   } else if (IsX86 && Name.startswith("sse4a.movnt.")) {
     Module *M = F->getParent();
     SmallVector Elts;
Index: llvm/trunk/lib/Target/X86/X86InstrSSE.td
===================================================================
--- llvm/trunk/lib/Target/X86/X86InstrSSE.td
+++ llvm/trunk/lib/Target/X86/X86InstrSSE.td
@@ -2009,24 +2009,35 @@
 // SSE2 packed instructions with XS prefix
 def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                          "cvttps2dq\t{$src, $dst|$dst, $src}",
-                         [], IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>;
+                         [(set VR128:$dst,
+                               (int_x86_sse2_cvttps2dq VR128:$src))],
+                         IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>;
 def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                          "cvttps2dq\t{$src, $dst|$dst, $src}",
-                         [], IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>;
+                         [(set VR128:$dst, (int_x86_sse2_cvttps2dq
+                                            (loadv4f32 addr:$src)))],
+                         IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>;
 def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                           "cvttps2dq\t{$src, $dst|$dst, $src}",
-                          [], IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
+                          [(set VR256:$dst,
+                                (int_x86_avx_cvtt_ps2dq_256 VR256:$src))],
+                          IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
 def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                           "cvttps2dq\t{$src, $dst|$dst, $src}",
-                          [], IIC_SSE_CVT_PS_RM>, VEX, VEX_L,
+                          [(set VR256:$dst, (int_x86_avx_cvtt_ps2dq_256
+                                             (loadv8f32 addr:$src)))],
+                          IIC_SSE_CVT_PS_RM>, VEX, VEX_L,
                           Sched<[WriteCvtF2ILd]>;
 def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "cvttps2dq\t{$src, $dst|$dst, $src}",
-                       [], IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>;
+                       [(set VR128:$dst, (int_x86_sse2_cvttps2dq VR128:$src))],
+                       IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>;
 def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                        "cvttps2dq\t{$src, $dst|$dst, $src}",
-                       [], IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>;
+                       [(set VR128:$dst,
+                             (int_x86_sse2_cvttps2dq (memopv4f32 addr:$src)))],
+                       IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>;
 
 let Predicates = [HasAVX] in {
   def : Pat<(int_x86_sse2_cvtdq2ps VR128:$src),
@@ -2096,10 +2107,14 @@
 // YMM only
 def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
                          "cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
-                         [], IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
+                         [(set VR128:$dst,
+                               (int_x86_avx_cvtt_pd2dq_256 VR256:$src))],
+                         IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
 def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
                          "cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
-                         [], IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
+                         [(set VR128:$dst,
+                               (int_x86_avx_cvtt_pd2dq_256 (loadv4f64 addr:$src)))],
+                         IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
 def : InstAlias<"vcvttpd2dq\t{$src, $dst|$dst, $src}",
                 (VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0>;
Index: llvm/trunk/test/CodeGen/X86/avx-intrinsics-fast-isel.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/avx-intrinsics-fast-isel.ll
+++ llvm/trunk/test/CodeGen/X86/avx-intrinsics-fast-isel.ll
@@ -681,10 +681,11 @@
 ; X64-NEXT:    vcvttpd2dqy %ymm0, %xmm0
 ; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
-  %cvt = fptosi <4 x double> %a0 to <4 x i32>
+  %cvt = call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %a0)
   %res = bitcast <4 x i32> %cvt to <2 x i64>
   ret <2 x i64> %res
 }
+declare <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double>) nounwind readnone
 
 define <4 x i64> @test_mm256_cvttps_epi32(<8 x float> %a0) nounwind {
 ; X32-LABEL: test_mm256_cvttps_epi32:
@@ -696,10 +697,11 @@
 ; X64:       # BB#0:
 ; X64-NEXT:    vcvttps2dq %ymm0, %ymm0
 ; X64-NEXT:    retq
-  %cvt = fptosi <8 x float> %a0 to <8 x i32>
+  %cvt = call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %a0)
   %res = bitcast <8 x i32> %cvt to <4 x i64>
   ret <4 x i64> %res
 }
+declare <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float>) nounwind readnone
 
 define <4 x double> @test_mm256_div_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
 ; X32-LABEL: test_mm256_div_pd:
Index: llvm/trunk/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
+++ llvm/trunk/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
@@ -359,35 +359,12 @@
 declare <4 x double> @llvm.x86.avx.cvt.ps2.pd.256(<4 x float>) nounwind readnone
 
 
-define <4 x i32> @test_x86_avx_cvtt_pd2dq_256(<4 x double> %a0) {
-; CHECK-LABEL: test_x86_avx_cvtt_pd2dq_256:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vcvttpd2dqy %ymm0, %xmm0
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retl
-  %res = call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %a0) ; <<4 x i32>> [#uses=1]
-  ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double>) nounwind readnone
-
-
-define <8 x i32> @test_x86_avx_cvtt_ps2dq_256(<8 x float> %a0) {
-; CHECK-LABEL: test_x86_avx_cvtt_ps2dq_256:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vcvttps2dq %ymm0, %ymm0
-; CHECK-NEXT:    retl
-  %res = call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %a0) ; <<8 x i32>> [#uses=1]
-  ret <8 x i32> %res
-}
-declare <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float>) nounwind readnone
-
-
 define void @test_x86_sse2_storeu_dq(i8* %a0, <16 x i8> %a1) {
   ; add operation forces the execution domain.
 ; CHECK-LABEL: test_x86_sse2_storeu_dq:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    vpaddb LCPI34_0, %xmm0, %xmm0
+; CHECK-NEXT:    vpaddb LCPI32_0, %xmm0, %xmm0
 ; CHECK-NEXT:    vmovdqu %xmm0, (%eax)
 ; CHECK-NEXT:    retl
   %a2 = add <16 x i8> %a1, 
Index: llvm/trunk/test/CodeGen/X86/avx-intrinsics-x86.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/avx-intrinsics-x86.ll
+++ llvm/trunk/test/CodeGen/X86/avx-intrinsics-x86.ll
@@ -3431,6 +3431,39 @@
 declare <8 x float> @llvm.x86.avx.cvtdq2.ps.256(<8 x i32>) nounwind readnone
 
 
+define <4 x i32> @test_x86_avx_cvtt_pd2dq_256(<4 x double> %a0) {
+; AVX-LABEL: test_x86_avx_cvtt_pd2dq_256:
+; AVX:       ## BB#0:
+; AVX-NEXT:    vcvttpd2dqy %ymm0, %xmm0
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    retl
+;
+; AVX512VL-LABEL: test_x86_avx_cvtt_pd2dq_256:
+; AVX512VL:       ## BB#0:
+; AVX512VL-NEXT:    vcvttpd2dqy %ymm0, %xmm0
+; AVX512VL-NEXT:    retl
+  %res = call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %a0) ; <<4 x i32>> [#uses=1]
+  ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double>) nounwind readnone
+
+
+define <8 x i32> @test_x86_avx_cvtt_ps2dq_256(<8 x float> %a0) {
+; AVX-LABEL: test_x86_avx_cvtt_ps2dq_256:
+; AVX:       ## BB#0:
+; AVX-NEXT:    vcvttps2dq %ymm0, %ymm0
+; AVX-NEXT:    retl
+;
+; AVX512VL-LABEL: test_x86_avx_cvtt_ps2dq_256:
+; AVX512VL:       ## BB#0:
+; AVX512VL-NEXT:    vcvttps2dq %ymm0, %ymm0
+; AVX512VL-NEXT:    retl
+  %res = call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %a0) ; <<8 x i32>> [#uses=1]
+  ret <8 x i32> %res
+}
+declare <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float>) nounwind readnone
+
+
 define <8 x float> @test_x86_avx_dp_ps_256(<8 x float> %a0, <8 x float> %a1) {
 ; AVX-LABEL: test_x86_avx_dp_ps_256:
 ; AVX:       ## BB#0:
@@ -4552,7 +4585,7 @@
 ; AVX-LABEL: movnt_dq:
 ; AVX:       ## BB#0:
 ; AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX-NEXT:    vpaddq LCPI254_0, %xmm0, %xmm0
+; AVX-NEXT:    vpaddq LCPI256_0, %xmm0, %xmm0
 ; AVX-NEXT:    vmovntdq %ymm0, (%eax)
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retl
@@ -4560,7 +4593,7 @@
 ; AVX512VL-LABEL: movnt_dq:
 ; AVX512VL:       ## BB#0:
 ; AVX512VL-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512VL-NEXT:    vpaddq LCPI254_0, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpaddq LCPI256_0, %xmm0, %xmm0
 ; AVX512VL-NEXT:    vmovntdq %ymm0, (%eax)
 ; AVX512VL-NEXT:    retl
   %a2 = add <2 x i64> %a1, 
Index: llvm/trunk/test/CodeGen/X86/sse-intrinsics-fast-isel-x86_64.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/sse-intrinsics-fast-isel-x86_64.ll
+++ llvm/trunk/test/CodeGen/X86/sse-intrinsics-fast-isel-x86_64.ll
@@ -6,13 +6,12 @@
 define <4 x float> @test_mm_cvtsi64_ss(<4 x float> %a0, i64 %a1) nounwind {
 ; X64-LABEL: test_mm_cvtsi64_ss:
 ; X64:       # BB#0:
-; X64-NEXT:    cvtsi2ssq %rdi, %xmm1
-; X64-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X64-NEXT:    cvtsi2ssq %rdi, %xmm0
 ; X64-NEXT:    retq
-  %cvt = sitofp i64 %a1 to float
-  %res = insertelement <4 x float> %a0, float %cvt, i32 0
+  %res = call <4 x float> @llvm.x86.sse.cvtsi642ss(<4 x float> %a0, i64 %a1)
   ret <4 x float> %res
 }
+declare <4 x float> @llvm.x86.sse.cvtsi642ss(<4 x float>, i64) nounwind readnone
 
 define i64 @test_mm_cvtss_si64(<4 x float> %a0) nounwind {
 ; X64-LABEL: test_mm_cvtss_si64:
@@ -29,7 +28,7 @@
 ; X64:       # BB#0:
 ; X64-NEXT:    cvttss2si %xmm0, %rax
 ; X64-NEXT:    retq
-  %cvt = extractelement <4 x float> %a0, i32 0
-  %res = fptosi float %cvt to i64
+  %res = call i64 @llvm.x86.sse.cvttss2si64(<4 x float> %a0)
   ret i64 %res
 }
+declare i64 @llvm.x86.sse.cvttss2si64(<4 x float>) nounwind readnone
Index: llvm/trunk/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
+++ llvm/trunk/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
@@ -707,20 +707,17 @@
 define <4 x float> @test_mm_cvtsi32_ss(<4 x float> %a0, i32 %a1) nounwind {
 ; X32-LABEL: test_mm_cvtsi32_ss:
 ; X32:       # BB#0:
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    cvtsi2ssl %eax, %xmm1
-; X32-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X32-NEXT:    cvtsi2ssl {{[0-9]+}}(%esp), %xmm0
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test_mm_cvtsi32_ss:
 ; X64:       # BB#0:
-; X64-NEXT:    cvtsi2ssl %edi, %xmm1
-; X64-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X64-NEXT:    cvtsi2ssl %edi, %xmm0
 ; X64-NEXT:    retq
-  %cvt = sitofp i32 %a1 to float
-  %res = insertelement <4 x float> %a0, float %cvt, i32 0
+  %res = call <4 x float> @llvm.x86.sse.cvtsi2ss(<4 x float> %a0, i32 %a1)
   ret <4 x float> %res
 }
+declare <4 x float> @llvm.x86.sse.cvtsi2ss(<4 x float>, i32) nounwind readnone
 
 define float @test_mm_cvtss_f32(<4 x float> %a0) nounwind {
 ; X32-LABEL: test_mm_cvtss_f32:
@@ -762,10 +759,10 @@
 ; X64:       # BB#0:
 ; X64-NEXT:    cvttss2si %xmm0, %eax
 ; X64-NEXT:    retq
-  %cvt = extractelement <4 x float> %a0, i32 0
-  %res = fptosi float %cvt to i32
+  %res = call i32 @llvm.x86.sse.cvttss2si(<4 x float> %a0)
   ret i32 %res
 }
+declare i32 @llvm.x86.sse.cvttss2si(<4 x float>) nounwind readnone
 
 define i32 @test_mm_cvttss_si32(<4 x float> %a0) nounwind {
 ; X32-LABEL: test_mm_cvttss_si32:
@@ -777,8 +774,7 @@
 ; X64:       # BB#0:
 ; X64-NEXT:    cvttss2si %xmm0, %eax
 ; X64-NEXT:    retq
-  %cvt = extractelement <4 x float> %a0, i32 0
-  %res = fptosi float %cvt to i32
+  %res = call i32 @llvm.x86.sse.cvttss2si(<4 x float> %a0)
   ret i32 %res
 }
Index: llvm/trunk/test/CodeGen/X86/sse2-intrinsics-fast-isel-x86_64.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/sse2-intrinsics-fast-isel-x86_64.ll
+++ llvm/trunk/test/CodeGen/X86/sse2-intrinsics-fast-isel-x86_64.ll
@@ -25,13 +25,12 @@
 define <2 x double> @test_mm_cvtsi64_sd(<2 x double> %a0, i64 %a1) nounwind {
 ; X64-LABEL: test_mm_cvtsi64_sd:
 ; X64:       # BB#0:
-; X64-NEXT:    cvtsi2sdq %rdi, %xmm1
-; X64-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X64-NEXT:    cvtsi2sdq %rdi, %xmm0
 ; X64-NEXT:    retq
-  %cvt = sitofp i64 %a1 to double
-  %res = insertelement <2 x double> %a0, double %cvt, i32 0
+  %res = call <2 x double> @llvm.x86.sse2.cvtsi642sd(<2 x double> %a0, i64 %a1)
   ret <2 x double> %res
 }
+declare <2 x double> @llvm.x86.sse2.cvtsi642sd(<2 x double>, i64) nounwind readnone
 
 define <2 x i64> @test_mm_cvtsi64_si128(i64 %a0) nounwind {
 ; X64-LABEL: test_mm_cvtsi64_si128:
@@ -48,10 +47,10 @@
 ; X64:       # BB#0:
 ; X64-NEXT:    cvttsd2si %xmm0, %rax
 ; X64-NEXT:    retq
-  %ext = extractelement <2 x double> %a0, i32 0
-  %res = fptosi double %ext to i64
+  %res = call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> %a0)
   ret i64 %res
 }
+declare i64 @llvm.x86.sse2.cvttsd2si64(<2 x double>) nounwind readnone
 
 define <2 x i64> @test_mm_loadu_si64(i64* %a0) nounwind {
 ; X64-LABEL: test_mm_loadu_si64:
Index: llvm/trunk/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
+++ llvm/trunk/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
@@ -1208,6 +1208,21 @@
 }
 declare i32 @llvm.x86.sse2.cvtsd2si(<2 x double>) nounwind readnone
 
+define <4 x float> @test_mm_cvtsd_ss(<4 x float> %a0, <2 x double> %a1) {
+; X32-LABEL: test_mm_cvtsd_ss:
+; X32:       # BB#0:
+; X32-NEXT:    cvtsd2ss %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cvtsd_ss:
+; X64:       # BB#0:
+; X64-NEXT:    cvtsd2ss %xmm1, %xmm0
+; X64-NEXT:    retq
+  %res = call <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float> %a0, <2 x double> %a1)
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float>, <2 x double>) nounwind readnone
+
 define i32 @test_mm_cvtsi128_si32(<2 x i64> %a0) nounwind {
 ; X32-LABEL: test_mm_cvtsi128_si32:
 ; X32:       # BB#0:
@@ -1303,10 +1318,11 @@
 ; X64:       # BB#0:
 ; X64-NEXT:    cvttps2dq %xmm0, %xmm0
 ; X64-NEXT:    retq
-  %res = fptosi <4 x float> %a0 to <4 x i32>
+  %res = call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %a0)
   %bc = bitcast <4 x i32> %res to <2 x i64>
   ret <2 x i64> %bc
 }
+declare <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float>) nounwind readnone
 
 define i32 @test_mm_cvttsd_si32(<2 x double> %a0) nounwind {
 ; X32-LABEL: test_mm_cvttsd_si32:
@@ -1318,10 +1334,10 @@
 ; X64:       # BB#0:
 ; X64-NEXT:    cvttsd2si %xmm0, %eax
 ; X64-NEXT:    retq
-  %ext = extractelement <2 x double> %a0, i32 0
-  %res = fptosi double %ext to i32
+  %res = call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> %a0)
   ret i32 %res
 }
+declare i32 @llvm.x86.sse2.cvttsd2si(<2 x double>) nounwind readnone
 
 define <2 x double> @test_mm_div_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
 ; X32-LABEL: test_mm_div_pd:
Index: llvm/trunk/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
+++ llvm/trunk/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
@@ -66,17 +66,6 @@
 declare <2 x double> @llvm.x86.sse2.cvtps2pd(<4 x float>) nounwind readnone
 
 
-define <4 x i32> @test_x86_sse2_cvttps2dq(<4 x float> %a0) {
-; CHECK-LABEL: test_x86_sse2_cvttps2dq:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    cvttps2dq %xmm0, %xmm0
-; CHECK-NEXT:    retl
-  %res = call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %a0) ; <<4 x i32>> [#uses=1]
-  ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float>) nounwind readnone
-
-
 define void @test_x86_sse2_storel_dq(i8* %a0, <4 x i32> %a1) {
 ; CHECK-LABEL: test_x86_sse2_storel_dq:
 ; CHECK:       ## BB#0:
@@ -94,7 +83,7 @@
 ; CHECK-LABEL: test_x86_sse2_storeu_dq:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    paddb LCPI8_0, %xmm0
+; CHECK-NEXT:    paddb LCPI7_0, %xmm0
 ; CHECK-NEXT:    movdqu %xmm0, (%eax)
 ; CHECK-NEXT:    retl
   %a2 = add <16 x i8> %a1, 
Index: llvm/trunk/test/CodeGen/X86/sse2-intrinsics-x86.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/sse2-intrinsics-x86.ll
+++ llvm/trunk/test/CodeGen/X86/sse2-intrinsics-x86.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=-avx,+sse2 | FileCheck %s --check-prefix=SSE
 ; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=KNL
 
@@ -322,6 +322,22 @@
 declare <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double>) nounwind readnone
 
 
+define <4 x i32> @test_x86_sse2_cvttps2dq(<4 x float> %a0) {
+; SSE-LABEL: test_x86_sse2_cvttps2dq:
+; SSE:       ## BB#0:
+; SSE-NEXT:    cvttps2dq %xmm0, %xmm0
+; SSE-NEXT:    retl
+;
+; KNL-LABEL: test_x86_sse2_cvttps2dq:
+; KNL:       ## BB#0:
+; KNL-NEXT:    vcvttps2dq %xmm0, %xmm0
+; KNL-NEXT:    retl
+  %res = call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %a0) ; <<4 x i32>> [#uses=1]
+  ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float>) nounwind readnone
+
+
 define i32 @test_x86_sse2_cvttsd2si(<2 x double> %a0) {
 ; SSE-LABEL: test_x86_sse2_cvttsd2si:
 ; SSE:       ## BB#0:
Index: llvm/trunk/test/Transforms/ConstProp/calls.ll
===================================================================
--- llvm/trunk/test/Transforms/ConstProp/calls.ll
+++ llvm/trunk/test/Transforms/ConstProp/calls.ll
@@ -193,11 +193,13 @@
   ret i1 %b
 }
 
-; TODO: Inexact values should not fold as they are dependent on rounding mode
+; Inexact values should not fold as they are dependent on rounding mode
 define i1 @test_sse_cvts_inexact() nounwind readnone {
 ; CHECK-LABEL: @test_sse_cvts_inexact(
-; CHECK-NOT: call
-; CHECK: ret i1 true
+; CHECK: call
+; CHECK: call
+; CHECK: call
+; CHECK: call
 entry:
   %i0 = tail call i32 @llvm.x86.sse.cvtss2si(<4 x float> ) nounwind
   %i1 = tail call i64 @llvm.x86.sse.cvtss2si64(<4 x float> ) nounwind