Index: llvm/trunk/lib/Target/X86/X86InstrSSE.td =================================================================== --- llvm/trunk/lib/Target/X86/X86InstrSSE.td +++ llvm/trunk/lib/Target/X86/X86InstrSSE.td @@ -3536,57 +3536,10 @@ >; } -/// sse1_fp_unop_s - SSE1 unops in scalar form. -multiclass sse1_fp_unop_s opc, string OpcodeStr, - SDNode OpNode, Intrinsic F32Int, OpndItins itins> { -let Predicates = [HasAVX], hasSideEffects = 0 in { - def V#NAME#SSr : SSI, VEX_4V, VEX_LIG, Sched<[itins.Sched]>; - let mayLoad = 1 in { - def V#NAME#SSm : SSI, VEX_4V, VEX_LIG, - Sched<[itins.Sched.Folded, ReadAfterLd]>; - let isCodeGenOnly = 1 in - def V#NAME#SSm_Int : SSI, VEX_4V, VEX_LIG, - Sched<[itins.Sched.Folded, ReadAfterLd]>; - } -} - - def SSr : SSI, Sched<[itins.Sched]>; - // For scalar unary operations, fold a load into the operation - // only in OptForSize mode. It eliminates an instruction, but it also - // eliminates a whole-register clobber (the load), so it introduces a - // partial register update condition. - def SSm : I, XS, - Requires<[UseSSE1, OptForSize]>, Sched<[itins.Sched.Folded]>; -let isCodeGenOnly = 1 in { - def SSr_Int : SSI, - Sched<[itins.Sched]>; - def SSm_Int : SSI, - Sched<[itins.Sched.Folded]>; -} -} - -/// sse1_fp_unop_s_rw - SSE1 unops where vector form has a read-write operand. -multiclass sse1_fp_unop_rw opc, string OpcodeStr, SDNode OpNode, +/// sse1_fp_unop_s - SSE1 unops in scalar form +/// For the non-AVX defs, we need $src1 to be tied to $dst because +/// the HW instructions are 2 operand / destructive. +multiclass sse1_fp_unop_s opc, string OpcodeStr, SDNode OpNode, OpndItins itins> { let Predicates = [HasAVX], hasSideEffects = 0 in { def V#NAME#SSr : SSI, +defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSS>, sse1_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPS>, - sse2_fp_unop_s<0x51, "sqrt", fsqrt, int_x86_sse2_sqrt_sd, + sse2_fp_unop_s<0x51, "sqrt", fsqrt, int_x86_sse2_sqrt_sd, SSE_SQRTSD>, sse2_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPD>; // Reciprocal approximations. Note that these typically require refinement // in order to obtain suitable precision. -defm RSQRT : sse1_fp_unop_rw<0x52, "rsqrt", X86frsqrt, SSE_RSQRTSS>, +defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SSE_RSQRTSS>, sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_RSQRTPS>, sse1_fp_unop_p_int<0x52, "rsqrt", int_x86_sse_rsqrt_ps, int_x86_avx_rsqrt_ps_256, SSE_RSQRTPS>; -defm RCP : sse1_fp_unop_rw<0x53, "rcp", X86frcp, SSE_RCPS>, +defm RCP : sse1_fp_unop_s<0x53, "rcp", X86frcp, SSE_RCPS>, sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP>, sse1_fp_unop_p_int<0x53, "rcp", int_x86_sse_rcp_ps, int_x86_avx_rcp_ps_256, SSE_RCPP>; @@ -3869,13 +3821,15 @@ (VRCPSSm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>; } -// Reciprocal approximations. Note that these typically require refinement -// in order to obtain suitable precision. +// These are unary operations, but they are modeled as having 2 source operands +// because the high elements of the destination are unchanged in SSE. let Predicates = [UseSSE1] in { def : Pat<(int_x86_sse_rsqrt_ss VR128:$src), (RSQRTSSr_Int VR128:$src, VR128:$src)>; def : Pat<(int_x86_sse_rcp_ss VR128:$src), (RCPSSr_Int VR128:$src, VR128:$src)>; + def : Pat<(int_x86_sse_sqrt_ss VR128:$src), + (SQRTSSr_Int VR128:$src, VR128:$src)>; } // There is no f64 version of the reciprocal approximation instructions. Index: llvm/trunk/test/CodeGen/X86/sse_partial_update.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/sse_partial_update.ll +++ llvm/trunk/test/CodeGen/X86/sse_partial_update.ll @@ -5,11 +5,18 @@ ; There is a mismatch between the intrinsic and the actual instruction. ; The actual instruction has a partial update of dest, while the intrinsic ; passes through the upper FP values. Here, we make sure the source and -; destination of rsqrtss are the same. -define void @t1(<4 x float> %a) nounwind uwtable ssp { +; destination of each scalar unary op are the same. + +define void @rsqrtss(<4 x float> %a) nounwind uwtable ssp { entry: -; CHECK-LABEL: t1: +; CHECK-LABEL: rsqrtss: ; CHECK: rsqrtss %xmm0, %xmm0 +; CHECK-NEXT: cvtss2sd %xmm0 +; CHECK-NEXT: shufps +; CHECK-NEXT: cvtss2sd %xmm0 +; CHECK-NEXT: movap +; CHECK-NEXT: jmp + %0 = tail call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %a) nounwind %a.addr.0.extract = extractelement <4 x float> %0, i32 0 %conv = fpext float %a.addr.0.extract to double @@ -21,10 +28,16 @@ declare void @callee(double, double) declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone -define void @t2(<4 x float> %a) nounwind uwtable ssp { +define void @rcpss(<4 x float> %a) nounwind uwtable ssp { entry: -; CHECK-LABEL: t2: +; CHECK-LABEL: rcpss: ; CHECK: rcpss %xmm0, %xmm0 +; CHECK-NEXT: cvtss2sd %xmm0 +; CHECK-NEXT: shufps +; CHECK-NEXT: cvtss2sd %xmm0 +; CHECK-NEXT: movap +; CHECK-NEXT: jmp + %0 = tail call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %a) nounwind %a.addr.0.extract = extractelement <4 x float> %0, i32 0 %conv = fpext float %a.addr.0.extract to double @@ -34,3 +47,23 @@ ret void } declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone + +define void @sqrtss(<4 x float> %a) nounwind uwtable ssp { +entry: +; CHECK-LABEL: sqrtss: +; CHECK: sqrtss %xmm0, %xmm0 +; CHECK-NEXT: cvtss2sd %xmm0 +; CHECK-NEXT: shufps +; CHECK-NEXT: cvtss2sd %xmm0 +; CHECK-NEXT: movap +; CHECK-NEXT: jmp + + %0 = tail call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %a) nounwind + %a.addr.0.extract = extractelement <4 x float> %0, i32 0 + %conv = fpext float %a.addr.0.extract to double + %a.addr.4.extract = extractelement <4 x float> %0, i32 1 + %conv3 = fpext float %a.addr.4.extract to double + tail call void @callee(double %conv, double %conv3) nounwind + ret void +} +declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone