Index: lib/Target/X86/X86InstrSSE.td =================================================================== --- lib/Target/X86/X86InstrSSE.td +++ lib/Target/X86/X86InstrSSE.td @@ -241,21 +241,20 @@ /// sse12_fp_scalar - SSE 1 & 2 scalar instructions class multiclass sse12_fp_scalar opc, string OpcodeStr, SDNode OpNode, RegisterClass RC, X86MemOperand x86memop, - OpndItins itins, - bit Is2Addr = 1> { + Domain d, OpndItins itins, bit Is2Addr = 1> { let isCommutable = 1 in { def rr : SI, + [(set RC:$dst, (OpNode RC:$src1, RC:$src2))], itins.rr, d>, Sched<[itins.Sched]>; } def rm : SI, + [(set RC:$dst, (OpNode RC:$src1, (load addr:$src2)))], itins.rm, d>, Sched<[itins.Sched.Folded, ReadAfterLd]>; } @@ -263,8 +262,7 @@ multiclass sse12_fp_scalar_int opc, string OpcodeStr, RegisterClass RC, string asm, string SSEVer, string FPSizeStr, Operand memopr, ComplexPattern mem_cpat, - OpndItins itins, - bit Is2Addr = 1> { + Domain d, OpndItins itins, bit Is2Addr = 1> { let isCodeGenOnly = 1 in { def rr_Int : SI( !strconcat("int_x86_sse", SSEVer, "_", OpcodeStr, FPSizeStr)) - RC:$src1, RC:$src2))], itins.rr>, + RC:$src1, RC:$src2))], itins.rr, d>, Sched<[itins.Sched]>; def rm_Int : SI(!strconcat("int_x86_sse", SSEVer, "_", OpcodeStr, FPSizeStr)) - RC:$src1, mem_cpat:$src2))], itins.rm>, + RC:$src1, mem_cpat:$src2))], itins.rm, d>, Sched<[itins.Sched.Folded, ReadAfterLd]>; } } @@ -3054,15 +3052,19 @@ multiclass basic_sse12_fp_binop_s opc, string OpcodeStr, SDNode OpNode, SizeItins itins> { defm V#NAME#SS : sse12_fp_scalar, XS, VEX_4V, VEX_LIG; + OpNode, FR32, f32mem, SSEPackedSingle, itins.s, 0>, + XS, VEX_4V, VEX_LIG; defm V#NAME#SD : sse12_fp_scalar, XD, VEX_4V, VEX_LIG; + OpNode, FR64, f64mem, SSEPackedDouble, itins.d, 0>, + XD, VEX_4V, VEX_LIG; let Constraints = "$src1 = $dst" in { defm SS : sse12_fp_scalar, XS; + OpNode, FR32, f32mem, SSEPackedSingle, + itins.s>, XS; defm SD : sse12_fp_scalar, XD; + OpNode, FR64, f64mem, SSEPackedDouble, + itins.d>, XD; } } @@ -3070,18 +3072,18 @@ SizeItins itins> { defm V#NAME#SS : sse12_fp_scalar_int, XS, VEX_4V, VEX_LIG; + SSEPackedSingle, itins.s, 0>, XS, VEX_4V, VEX_LIG; defm V#NAME#SD : sse12_fp_scalar_int, XD, VEX_4V, VEX_LIG; + SSEPackedDouble, itins.d, 0>, XD, VEX_4V, VEX_LIG; let Constraints = "$src1 = $dst" in { defm SS : sse12_fp_scalar_int, XS; + SSEPackedSingle, itins.s>, XS; defm SD : sse12_fp_scalar_int, XD; + SSEPackedDouble, itins.d>, XD; } } @@ -3170,7 +3172,7 @@ (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)))), (!cast(OpcPrefix#SSrr_Int) v4f32:$dst, v4f32:$src)>; } - + // With SSE 4.1, insertps/blendi are preferred to movsd, so match those too. let Predicates = [UseSSE41] in { // extracted scalar math op with insert via insertps @@ -3203,7 +3205,7 @@ FR32:$src))), (iPTR 0))), (!cast("V"#OpcPrefix#SSrr_Int) v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; - + // extracted scalar math op with insert via blend def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (Op (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), @@ -3251,7 +3253,7 @@ FR64:$src))), (i8 1))), (!cast(OpcPrefix#SDrr_Int) v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; - + // vector math op with insert via blend def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), @@ -3345,17 +3347,17 @@ ValueType vt, ValueType ScalarVT, X86MemOperand x86memop, Operand vec_memop, ComplexPattern mem_cpat, Intrinsic Intr, - SDNode OpNode, OpndItins itins, Predicate target, - string Suffix> { + SDNode OpNode, Domain d, OpndItins itins, + Predicate target, string Suffix> { let hasSideEffects = 0 in { def r : I, Sched<[itins.Sched]>, + [(set RC:$dst, (OpNode RC:$src1))], itins.rr, d>, Sched<[itins.Sched]>, Requires<[target]>; let mayLoad = 1 in def m : I, + [(set RC:$dst, (OpNode (load addr:$src1)))], itins.rm, d>, Sched<[itins.Sched.Folded, ReadAfterLd]>, Requires<[target, OptForSize]>; @@ -3378,7 +3380,7 @@ // because the high elements of the destination are unchanged in SSE. def : Pat<(Intr VR128:$src), (!cast(NAME#Suffix##r_Int) VR128:$src, VR128:$src)>; - def : Pat<(Intr (load addr:$src)), + def : Pat<(Intr (load addr:$src)), (vt (COPY_TO_REGCLASS(!cast(NAME#Suffix##m) addr:$src), VR128))>; def : Pat<(Intr mem_cpat:$src), @@ -3391,24 +3393,24 @@ ValueType vt, ValueType ScalarVT, X86MemOperand x86memop, Operand vec_memop, ComplexPattern mem_cpat, - Intrinsic Intr, SDNode OpNode, OpndItins itins, - Predicate target, string Suffix> { + Intrinsic Intr, SDNode OpNode, Domain d, + OpndItins itins, Predicate target, string Suffix> { let hasSideEffects = 0 in { def r : I, Sched<[itins.Sched]>; - let mayLoad = 1 in + [], itins.rr, d>, Sched<[itins.Sched]>; + let mayLoad = 1 in def m : I, Sched<[itins.Sched.Folded, ReadAfterLd]>; + [], itins.rm, d>, Sched<[itins.Sched.Folded, ReadAfterLd]>; let isCodeGenOnly = 1 in { // todo: uncomment when all r_Int forms will be added to X86InstrInfo.cpp - //def r_Int : I, Sched<[itins.Sched.Folded]>; let mayLoad = 1 in - def m_Int : I, Sched<[itins.Sched.Folded, ReadAfterLd]>; @@ -3419,7 +3421,7 @@ def : Pat<(OpNode RC:$src), (!cast("V"#NAME#Suffix##r) (ScalarVT (IMPLICIT_DEF)), RC:$src)>; - def : Pat<(vt (OpNode mem_cpat:$src)), + def : Pat<(vt (OpNode mem_cpat:$src)), (!cast("V"#NAME#Suffix##m_Int) (vt (IMPLICIT_DEF)), mem_cpat:$src)>; @@ -3428,14 +3430,14 @@ // (VT (IMPLICIT_DEF)), VR128:$src)>; def : Pat<(Intr VR128:$src), (vt (COPY_TO_REGCLASS( - !cast("V"#NAME#Suffix##r) (ScalarVT (IMPLICIT_DEF)), + !cast("V"#NAME#Suffix##r) (ScalarVT (IMPLICIT_DEF)), (ScalarVT (COPY_TO_REGCLASS VR128:$src, RC))), VR128))>; def : Pat<(Intr mem_cpat:$src), (!cast("V"#NAME#Suffix##m_Int) (vt (IMPLICIT_DEF)), mem_cpat:$src)>; } let Predicates = [target, OptForSize] in - def : Pat<(ScalarVT (OpNode (load addr:$src))), + def : Pat<(ScalarVT (OpNode (load addr:$src))), (!cast("V"#NAME#Suffix##m) (ScalarVT (IMPLICIT_DEF)), addr:$src)>; } @@ -3557,11 +3559,11 @@ defm SS : sse_fp_unop_s("int_x86_sse_"##OpcodeStr##_ss), OpNode, - itins, UseSSE1, "SS">, XS; + SSEPackedSingle, itins, UseSSE1, "SS">, XS; defm V#NAME#SS : avx_fp_unop_s("int_x86_sse_"##OpcodeStr##_ss), OpNode, - itins, UseAVX, "SS">, XS, VEX_4V, VEX_LIG; + SSEPackedSingle, itins, UseAVX, "SS">, XS, VEX_4V, VEX_LIG; } multiclass sse2_fp_unop_s opc, string OpcodeStr, SDNode OpNode, @@ -3569,11 +3571,12 @@ defm SD : sse_fp_unop_s("int_x86_sse2_"##OpcodeStr##_sd), - OpNode, itins, UseSSE2, "SD">, XD; + OpNode, SSEPackedDouble, itins, UseSSE2, "SD">, XD; defm V#NAME#SD : avx_fp_unop_s("int_x86_sse2_"##OpcodeStr##_sd), - OpNode, itins, UseAVX, "SD">, XD, VEX_4V, VEX_LIG; + OpNode, SSEPackedDouble, itins, UseAVX, "SD">, + XD, VEX_4V, VEX_LIG; } // Square root. Index: test/CodeGen/X86/sink-hoist.ll =================================================================== --- test/CodeGen/X86/sink-hoist.ll +++ test/CodeGen/X86/sink-hoist.ll @@ -9,7 +9,7 @@ ; CHECK-NEXT: testb $1, %dil ; CHECK-NEXT: jne ; CHECK-NEXT: divsd -; CHECK-NEXT: movaps +; CHECK-NEXT: movapd ; CHECK-NEXT: ret ; CHECK: divsd @@ -28,7 +28,7 @@ ; CHECK-NEXT: testb $1, %dil ; CHECK-NEXT: je ; CHECK: divsd -; CHECK: movaps +; CHECK: movapd ; CHECK: ret define double @split(double %x, double %y, i1 %c) nounwind { %a = fdiv double %x, 3.2 Index: test/CodeGen/X86/sse-minmax.ll =================================================================== --- test/CodeGen/X86/sse-minmax.ll +++ test/CodeGen/X86/sse-minmax.ll @@ -805,7 +805,7 @@ ; CHECK-LABEL: clampTo3k_a: ; CHECK-NEXT: movsd {{[^,]*}}, %xmm1 ; CHECK-NEXT: minsd %xmm0, %xmm1 -; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: movapd %xmm1, %xmm0 ; CHECK-NEXT: ret ; UNSAFE-LABEL: clampTo3k_a: ; UNSAFE-NEXT: minsd {{[^,]*}}, %xmm0 @@ -813,7 +813,7 @@ ; FINITE-LABEL: clampTo3k_a: ; FINITE-NEXT: movsd {{[^,]*}}, %xmm1 ; FINITE-NEXT: minsd %xmm0, %xmm1 -; FINITE-NEXT: movaps %xmm1, %xmm0 +; FINITE-NEXT: movapd %xmm1, %xmm0 ; FINITE-NEXT: ret define double @clampTo3k_a(double %x) nounwind readnone { entry: @@ -831,7 +831,7 @@ ; FINITE-LABEL: clampTo3k_b: ; FINITE-NEXT: movsd {{[^,]*}}, %xmm1 ; FINITE-NEXT: minsd %xmm0, %xmm1 -; FINITE-NEXT: movaps %xmm1, %xmm0 +; FINITE-NEXT: movapd %xmm1, %xmm0 ; FINITE-NEXT: ret define double @clampTo3k_b(double %x) nounwind readnone { entry: @@ -843,7 +843,7 @@ ; CHECK-LABEL: clampTo3k_c: ; CHECK-NEXT: movsd {{[^,]*}}, %xmm1 ; CHECK-NEXT: maxsd %xmm0, %xmm1 -; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: movapd %xmm1, %xmm0 ; CHECK-NEXT: ret ; UNSAFE-LABEL: clampTo3k_c: ; UNSAFE-NEXT: maxsd {{[^,]*}}, %xmm0 @@ -851,7 +851,7 @@ ; FINITE-LABEL: clampTo3k_c: ; FINITE-NEXT: movsd {{[^,]*}}, %xmm1 ; FINITE-NEXT: maxsd %xmm0, %xmm1 -; FINITE-NEXT: movaps %xmm1, %xmm0 +; FINITE-NEXT: movapd %xmm1, %xmm0 ; FINITE-NEXT: ret define double @clampTo3k_c(double %x) nounwind readnone { entry: @@ -869,7 +869,7 @@ ; FINITE-LABEL: clampTo3k_d: ; FINITE-NEXT: movsd {{[^,]*}}, %xmm1 ; FINITE-NEXT: maxsd %xmm0, %xmm1 -; FINITE-NEXT: movaps %xmm1, %xmm0 +; FINITE-NEXT: movapd %xmm1, %xmm0 ; FINITE-NEXT: ret define double @clampTo3k_d(double %x) nounwind readnone { entry: @@ -881,7 +881,7 @@ ; CHECK-LABEL: clampTo3k_e: ; CHECK-NEXT: movsd {{[^,]*}}, %xmm1 ; CHECK-NEXT: maxsd %xmm0, %xmm1 -; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: movapd %xmm1, %xmm0 ; CHECK-NEXT: ret ; UNSAFE-LABEL: clampTo3k_e: ; UNSAFE-NEXT: maxsd {{[^,]*}}, %xmm0 @@ -889,7 +889,7 @@ ; FINITE-LABEL: clampTo3k_e: ; FINITE-NEXT: movsd {{[^,]*}}, %xmm1 ; FINITE-NEXT: maxsd %xmm0, %xmm1 -; FINITE-NEXT: movaps %xmm1, %xmm0 +; FINITE-NEXT: movapd %xmm1, %xmm0 ; FINITE-NEXT: ret define double @clampTo3k_e(double %x) nounwind readnone { entry: @@ -907,7 +907,7 @@ ; FINITE-LABEL: clampTo3k_f: ; FINITE-NEXT: movsd {{[^,]*}}, %xmm1 ; FINITE-NEXT: maxsd %xmm0, %xmm1 -; FINITE-NEXT: movaps %xmm1, %xmm0 +; FINITE-NEXT: movapd %xmm1, %xmm0 ; FINITE-NEXT: ret define double @clampTo3k_f(double %x) nounwind readnone { entry: @@ -919,7 +919,7 @@ ; CHECK-LABEL: clampTo3k_g: ; CHECK-NEXT: movsd {{[^,]*}}, %xmm1 ; CHECK-NEXT: minsd %xmm0, %xmm1 -; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: movapd %xmm1, %xmm0 ; CHECK-NEXT: ret ; UNSAFE-LABEL: clampTo3k_g: ; UNSAFE-NEXT: minsd {{[^,]*}}, %xmm0 @@ -927,7 +927,7 @@ ; FINITE-LABEL: clampTo3k_g: ; FINITE-NEXT: movsd {{[^,]*}}, %xmm1 ; FINITE-NEXT: minsd %xmm0, %xmm1 -; FINITE-NEXT: movaps %xmm1, %xmm0 +; FINITE-NEXT: movapd %xmm1, %xmm0 ; FINITE-NEXT: ret define double @clampTo3k_g(double %x) nounwind readnone { entry: @@ -945,7 +945,7 @@ ; FINITE-LABEL: clampTo3k_h: ; FINITE-NEXT: movsd {{[^,]*}}, %xmm1 ; FINITE-NEXT: minsd %xmm0, %xmm1 -; FINITE-NEXT: movaps %xmm1, %xmm0 +; FINITE-NEXT: movapd %xmm1, %xmm0 ; FINITE-NEXT: ret define double @clampTo3k_h(double %x) nounwind readnone { entry: Index: test/CodeGen/X86/sse-scalar-fp-arith.ll =================================================================== --- test/CodeGen/X86/sse-scalar-fp-arith.ll +++ test/CodeGen/X86/sse-scalar-fp-arith.ll @@ -76,6 +76,31 @@ ret <4 x float> %3 } +define <4 x float> @test_sqrt_ss(<4 x float> %a) { +; SSE2-LABEL: test_sqrt_ss: +; SSE2: # BB#0: +; SSE2-NEXT: sqrtss %xmm0, %xmm1 +; SSE2-NEXT: movss %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_sqrt_ss: +; SSE41: # BB#0: +; SSE41-NEXT: sqrtss %xmm0, %xmm1 +; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; SSE41-NEXT: retq +; +; AVX-LABEL: test_sqrt_ss: +; AVX: # BB#0: +; AVX-NEXT: vsqrtss %xmm0, %xmm0, %xmm1 +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; AVX-NEXT: retq + %1 = extractelement <4 x float> %a, i32 0 + %2 = call float @llvm.sqrt.f32(float %1) + %3 = insertelement <4 x float> %a, float %2, i32 0 + ret <4 x float> %3 +} +declare float @llvm.sqrt.f32(float) + define <2 x double> @test_add_sd(<2 x double> %a, <2 x double> %b) { ; SSE-LABEL: test_add_sd: ; SSE: # BB#0: @@ -144,6 +169,25 @@ ret <2 x double> %3 } +define <2 x double> @test_sqrt_sd(<2 x double> %a) { +; SSE-LABEL: test_sqrt_sd: +; SSE: # BB#0: +; SSE-NEXT: sqrtsd %xmm0, %xmm1 +; SSE-NEXT: movsd %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_sqrt_sd: +; AVX: # BB#0: +; AVX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm1 +; AVX-NEXT: vmovsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq + %1 = extractelement <2 x double> %a, i32 0 + %2 = call double @llvm.sqrt.f64(double %1) + %3 = insertelement <2 x double> %a, double %2, i32 0 + ret <2 x double> %3 +} +declare double @llvm.sqrt.f64(double) + define <4 x float> @test2_add_ss(<4 x float> %a, <4 x float> %b) { ; SSE-LABEL: test2_add_ss: ; SSE: # BB#0: @@ -220,7 +264,7 @@ ; SSE-LABEL: test2_add_sd: ; SSE: # BB#0: ; SSE-NEXT: addsd %xmm0, %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movapd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: test2_add_sd: @@ -238,7 +282,7 @@ ; SSE-LABEL: test2_sub_sd: ; SSE: # BB#0: ; SSE-NEXT: subsd %xmm0, %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movapd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: test2_sub_sd: @@ -256,7 +300,7 @@ ; SSE-LABEL: test2_mul_sd: ; SSE: # BB#0: ; SSE-NEXT: mulsd %xmm0, %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movapd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: test2_mul_sd: @@ -274,7 +318,7 @@ ; SSE-LABEL: test2_div_sd: ; SSE: # BB#0: ; SSE-NEXT: divsd %xmm0, %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movapd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: test2_div_sd: @@ -371,7 +415,7 @@ } ; With SSE4.1 or greater, the shuffles in the following tests may -; be lowered to X86Blendi nodes. +; be lowered to X86Blendi nodes. define <4 x float> @blend_add_ss(<4 x float> %a, float %b) { ; SSE-LABEL: blend_add_ss: @@ -708,7 +752,7 @@ ; SSE-LABEL: insert_test2_add_sd: ; SSE: # BB#0: ; SSE-NEXT: addsd %xmm0, %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movapd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: insert_test2_add_sd: @@ -724,7 +768,7 @@ ; SSE-LABEL: insert_test2_sub_sd: ; SSE: # BB#0: ; SSE-NEXT: subsd %xmm0, %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movapd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: insert_test2_sub_sd: @@ -740,7 +784,7 @@ ; SSE-LABEL: insert_test2_mul_sd: ; SSE: # BB#0: ; SSE-NEXT: mulsd %xmm0, %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movapd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: insert_test2_mul_sd: @@ -756,7 +800,7 @@ ; SSE-LABEL: insert_test2_div_sd: ; SSE: # BB#0: ; SSE-NEXT: divsd %xmm0, %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movapd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: insert_test2_div_sd: @@ -956,7 +1000,7 @@ ; SSE-LABEL: insert_test4_add_sd: ; SSE: # BB#0: ; SSE-NEXT: addsd %xmm0, %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movapd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: insert_test4_add_sd: @@ -972,7 +1016,7 @@ ; SSE-LABEL: insert_test4_sub_sd: ; SSE: # BB#0: ; SSE-NEXT: subsd %xmm0, %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movapd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: insert_test4_sub_sd: @@ -988,7 +1032,7 @@ ; SSE-LABEL: insert_test4_mul_sd: ; SSE: # BB#0: ; SSE-NEXT: mulsd %xmm0, %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movapd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: insert_test4_mul_sd: @@ -1004,7 +1048,7 @@ ; SSE-LABEL: insert_test4_div_sd: ; SSE: # BB#0: ; SSE-NEXT: divsd %xmm0, %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movapd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: insert_test4_div_sd: