Index: lib/Target/X86/X86InstrInfo.cpp =================================================================== --- lib/Target/X86/X86InstrInfo.cpp +++ lib/Target/X86/X86InstrInfo.cpp @@ -1131,9 +1131,13 @@ { X86::VCVTSS2SDrr, X86::VCVTSS2SDrm, 0 }, { X86::Int_VCVTSS2SDrr, X86::Int_VCVTSS2SDrm, 0 }, { X86::VRCPSSr, X86::VRCPSSm, 0 }, + { X86::VRCPSSr_Int, X86::VRCPSSm_Int, 0 }, { X86::VRSQRTSSr, X86::VRSQRTSSm, 0 }, + { X86::VRSQRTSSr_Int, X86::VRSQRTSSm_Int, 0 }, { X86::VSQRTSDr, X86::VSQRTSDm, 0 }, + { X86::VSQRTSDr_Int, X86::VSQRTSDm_Int, 0 }, { X86::VSQRTSSr, X86::VSQRTSSm, 0 }, + { X86::VSQRTSSr_Int, X86::VSQRTSSm_Int, 0 }, { X86::VADDPDrr, X86::VADDPDrm, 0 }, { X86::VADDPSrr, X86::VADDPSrm, 0 }, { X86::VADDSDrr, X86::VADDSDrm, 0 }, Index: lib/Target/X86/X86InstrSSE.td =================================================================== --- lib/Target/X86/X86InstrSSE.td +++ lib/Target/X86/X86InstrSSE.td @@ -3372,17 +3372,23 @@ OpndItins itins> { let Predicates = [HasAVX], hasSideEffects = 0 in { def V#NAME#SSr : SSI, VEX_4V, VEX_LIG, Sched<[itins.Sched]>; + (ins FR32:$src1, FR32:$src2), + !strconcat("v", OpcodeStr, + "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, VEX_4V, VEX_LIG, Sched<[itins.Sched]>; + let isCodeGenOnly = 1 in + def V#NAME#SSr_Int : SSI, VEX_4V, VEX_LIG, Sched<[itins.Sched]>; let mayLoad = 1 in { def V#NAME#SSm : SSI, VEX_4V, VEX_LIG, - Sched<[itins.Sched.Folded, ReadAfterLd]>; + Sched<[itins.Sched.Folded, ReadAfterLd]>; let isCodeGenOnly = 1 in def V#NAME#SSm_Int : SSI, VEX_4V, VEX_LIG, Sched<[itins.Sched]>; + let isCodeGenOnly = 1 in + def V#NAME#SDr_Int : SDI, VEX_4V, VEX_LIG, Sched<[itins.Sched]>; let mayLoad = 1 in { def V#NAME#SDm : SDI; + (VSQRTSSr_Int (v4f32 (IMPLICIT_DEF)), VR128:$src)>; def : Pat<(int_x86_sse_sqrt_ss sse_load_f32:$src), (VSQRTSSm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>; def : Pat<(int_x86_sse2_sqrt_sd VR128:$src), - (COPY_TO_REGCLASS (VSQRTSDr (f64 (IMPLICIT_DEF)), - (COPY_TO_REGCLASS VR128:$src, FR64)), - 
VR128)>; + (VSQRTSDr_Int (v2f64 (IMPLICIT_DEF)), VR128:$src)>; def : Pat<(int_x86_sse2_sqrt_sd sse_load_f64:$src), (VSQRTSDm_Int (v2f64 (IMPLICIT_DEF)), sse_load_f64:$src)>; } let Predicates = [HasAVX] in { def : Pat<(int_x86_sse_rsqrt_ss VR128:$src), - (COPY_TO_REGCLASS (VRSQRTSSr (f32 (IMPLICIT_DEF)), - (COPY_TO_REGCLASS VR128:$src, FR32)), - VR128)>; + (VRSQRTSSr_Int (v4f32 (IMPLICIT_DEF)), VR128:$src)>; def : Pat<(int_x86_sse_rsqrt_ss sse_load_f32:$src), (VRSQRTSSm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>; def : Pat<(int_x86_sse_rcp_ss VR128:$src), - (COPY_TO_REGCLASS (VRCPSSr (f32 (IMPLICIT_DEF)), - (COPY_TO_REGCLASS VR128:$src, FR32)), - VR128)>; + (VRCPSSr_Int (v4f32 (IMPLICIT_DEF)), VR128:$src)>; def : Pat<(int_x86_sse_rcp_ss sse_load_f32:$src), (VRCPSSm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>; } Index: test/CodeGen/X86/fold-load-unops.ll =================================================================== --- test/CodeGen/X86/fold-load-unops.ll +++ test/CodeGen/X86/fold-load-unops.ll @@ -0,0 +1,54 @@ +; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx < %s | FileCheck %s + +; Verify that we're folding the load into the math instruction. 
+
+define float @rcpss(float* %a) {
+; CHECK-LABEL: rcpss:
+; CHECK: vrcpss (%rdi), %xmm0, %xmm0
+; Load a scalar, insert it into lane 0 of an undef vector, and return lane 0 of the intrinsic result.
+  %ld = load float* %a
+  %ins = insertelement <4 x float> undef, float %ld, i32 0
+  %res = tail call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %ins) nounwind
+  %ext = extractelement <4 x float> %res, i32 0
+  ret float %ext
+}
+
+define float @rsqrtss(float* %a) {
+; CHECK-LABEL: rsqrtss:
+; CHECK: vrsqrtss (%rdi), %xmm0, %xmm0
+; Load a scalar, insert it into lane 0 of an undef vector, and return lane 0 of the intrinsic result.
+  %ld = load float* %a
+  %ins = insertelement <4 x float> undef, float %ld, i32 0
+  %res = tail call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %ins) nounwind
+  %ext = extractelement <4 x float> %res, i32 0
+  ret float %ext
+}
+
+define float @sqrtss(float* %a) {
+; CHECK-LABEL: sqrtss:
+; CHECK: vsqrtss (%rdi), %xmm0, %xmm0
+; Load a scalar, insert it into lane 0 of an undef vector, and return lane 0 of the intrinsic result.
+  %ld = load float* %a
+  %ins = insertelement <4 x float> undef, float %ld, i32 0
+  %res = tail call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %ins) nounwind
+  %ext = extractelement <4 x float> %res, i32 0
+  ret float %ext
+}
+
+define double @sqrtsd(double* %a) {
+; CHECK-LABEL: sqrtsd:
+; CHECK: vsqrtsd (%rdi), %xmm0, %xmm0
+; Load a scalar, insert it into lane 0 of an undef vector, and return lane 0 of the intrinsic result.
+  %ld = load double* %a
+  %ins = insertelement <2 x double> undef, double %ld, i32 0
+  %res = tail call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %ins) nounwind
+  %ext = extractelement <2 x double> %res, i32 0
+  ret double %ext
+}
+
+; Declarations of the x86 scalar intrinsics called by the functions above.
+declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
+declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
+declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
+declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
+