Index: lib/Target/X86/X86InstrAVX512.td =================================================================== --- lib/Target/X86/X86InstrAVX512.td +++ lib/Target/X86/X86InstrAVX512.td @@ -6154,12 +6154,7 @@ def : Pat<(f64 (extloadf32 addr:$src)), (COPY_TO_REGCLASS (VCVTSS2SDZrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>, - Requires<[HasAVX512, OptForSize]>; - -def : Pat<(f64 (extloadf32 addr:$src)), - (COPY_TO_REGCLASS (VCVTSS2SDZrr (v4f32 (IMPLICIT_DEF)), - (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)), VR128X)>, - Requires<[HasAVX512, OptForSpeed]>; + Requires<[HasAVX512]>; def : Pat<(f32 (fpround FR64X:$src)), (COPY_TO_REGCLASS (VCVTSD2SSZrr (COPY_TO_REGCLASS FR64X:$src, VR128X), @@ -7185,10 +7180,6 @@ def : Pat<(_.EltVT (OpNode _.FRC:$src)), (!cast<Instruction>(NAME#SUFF#Zr) (_.EltVT (IMPLICIT_DEF)), _.FRC:$src)>; - - def : Pat<(_.EltVT (OpNode (load addr:$src))), - (!cast<Instruction>(NAME#SUFF#Zm) - (_.EltVT (IMPLICIT_DEF)), addr:$src)>, Requires<[HasAVX512, OptForSize]>; } multiclass avx512_sqrt_scalar_all<bits<8> opc, string OpcodeStr> { @@ -7206,14 +7197,8 @@ let Predicates = [HasAVX512] in { def : Pat<(f32 (X86frsqrt FR32X:$src)), (COPY_TO_REGCLASS (VRSQRT14SSrr (v4f32 (IMPLICIT_DEF)), (COPY_TO_REGCLASS FR32X:$src, VR128X)), VR128X)>; - def : Pat<(f32 (X86frsqrt (load addr:$src))), - (COPY_TO_REGCLASS (VRSQRT14SSrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>, - Requires<[OptForSize]>; def : Pat<(f32 (X86frcp FR32X:$src)), (COPY_TO_REGCLASS (VRCP14SSrr (v4f32 (IMPLICIT_DEF)), (COPY_TO_REGCLASS FR32X:$src, VR128X)), VR128X )>; - def : Pat<(f32 (X86frcp (load addr:$src))), - (COPY_TO_REGCLASS (VRCP14SSrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>, - Requires<[OptForSize]>; } multiclass Index: lib/Target/X86/X86InstrInfo.cpp =================================================================== --- lib/Target/X86/X86InstrInfo.cpp +++ lib/Target/X86/X86InstrInfo.cpp @@ -1806,6 +1806,7 @@ { X86::VCMPSDZrr_Int, X86::VCMPSDZrm_Int, TB_NO_REVERSE }, { X86::VCMPSSZrr, X86::VCMPSSZrm, 0 }, { 
X86::VCMPSSZrr_Int, X86::VCMPSSZrm_Int, TB_NO_REVERSE }, + { X86::VCVTSS2SDZrr, X86::VCVTSS2SDZrm, TB_NO_REVERSE }, { X86::VDIVPDZrr, X86::VDIVPDZrm, 0 }, { X86::VDIVPSZrr, X86::VDIVPSZrm, 0 }, { X86::VDIVSDZrr, X86::VDIVSDZrm, 0 }, @@ -1923,8 +1924,12 @@ { X86::VPUNPCKLWDZrr, X86::VPUNPCKLWDZrm, 0 }, { X86::VPXORDZrr, X86::VPXORDZrm, 0 }, { X86::VPXORQZrr, X86::VPXORQZrm, 0 }, + { X86::VRCP14SSrr, X86::VRCP14SSrm, TB_NO_REVERSE }, + { X86::VRSQRT14SSrr, X86::VRSQRT14SSrm, TB_NO_REVERSE }, { X86::VSHUFPDZrri, X86::VSHUFPDZrmi, 0 }, { X86::VSHUFPSZrri, X86::VSHUFPSZrmi, 0 }, + { X86::VSQRTSSZr, X86::VSQRTSSZm, 0 }, + { X86::VSQRTSDZr, X86::VSQRTSDZm, 0 }, { X86::VSUBPDZrr, X86::VSUBPDZrm, 0 }, { X86::VSUBPSZrr, X86::VSUBPSZrm, 0 }, { X86::VSUBSDZrr, X86::VSUBSDZrm, 0 }, Index: lib/Target/X86/X86InstrSSE.td =================================================================== --- lib/Target/X86/X86InstrSSE.td +++ lib/Target/X86/X86InstrSSE.td @@ -1786,7 +1786,7 @@ (ins FR64:$src1, f64mem:$src2), "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [], IIC_SSE_CVT_Scalar_RM>, - XD, Requires<[HasAVX, OptForSize]>, VEX_4V, VEX_LIG, + XD, Requires<[HasAVX]>, VEX_4V, VEX_LIG, Sched<[WriteCvtF2FLd, ReadAfterLd]>; } @@ -1852,7 +1852,7 @@ (ins FR32:$src1, f32mem:$src2), "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", [], IIC_SSE_CVT_Scalar_RM>, - XS, VEX_4V, VEX_LIG, Requires<[HasAVX, OptForSize]>, + XS, VEX_4V, VEX_LIG, Requires<[HasAVX]>, Sched<[WriteCvtF2FLd, ReadAfterLd]>; } @@ -1863,10 +1863,7 @@ def : Pat<(extloadf32 addr:$src), (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>, - Requires<[UseAVX, OptForSize]>; -def : Pat<(extloadf32 addr:$src), - (VCVTSS2SDrr (f32 (IMPLICIT_DEF)), (VMOVSSrm addr:$src))>, - Requires<[UseAVX, OptForSpeed]>; + Requires<[UseAVX]>; def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src), "cvtss2sd\t{$src, $dst|$dst, $src}", @@ -3457,30 +3454,23 @@ } } - // We don't want to fold scalar loads into these instructions unless - 
// optimizing for size. This is because the folded instruction will have a - // partial register update, while the unfolded sequence will not, e.g. - // vmovss mem, %xmm0 - // vrcpss %xmm0, %xmm0, %xmm0 - // which has a clobber before the rcp, vs. - // vrcpss mem, %xmm0, %xmm0 - // TODO: In theory, we could fold the load, and avoid the stall caused by - // the partial register store, either in ExeDepFix or with smarter RA. let Predicates = [UseAVX] in { def : Pat<(OpNode RC:$src), (!cast<Instruction>("V"#NAME#Suffix##r) (ScalarVT (IMPLICIT_DEF)), RC:$src)>; + } let Predicates = [HasAVX] in { def : Pat<(Intr VR128:$src), (!cast<Instruction>("V"#NAME#Suffix##r_Int) VR128:$src, VR128:$src)>; } - let Predicates = [HasAVX, OptForSize] in { + + let Predicates = [HasAVX] in { def : Pat<(Intr (scalar_to_vector (ScalarVT (load addr:$src2)))), (!cast<Instruction>("V"#NAME#Suffix##m_Int) (vt (IMPLICIT_DEF)), addr:$src2)>; } - let Predicates = [UseAVX, OptForSize] in { + let Predicates = [UseAVX] in { def : Pat<(ScalarVT (OpNode (load addr:$src))), (!cast<Instruction>("V"#NAME#Suffix##m) (ScalarVT (IMPLICIT_DEF)), addr:$src)>; Index: test/CodeGen/X86/avx-arith.ll =================================================================== --- test/CodeGen/X86/avx-arith.ll +++ test/CodeGen/X86/avx-arith.ll @@ -350,8 +350,7 @@ define <4 x float> @int_sqrt_ss() { ; CHECK-LABEL: int_sqrt_ss: ; CHECK: ## BB#0: -; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vsqrtss (%rax), %xmm0, %xmm0 ; CHECK-NEXT: retq %x0 = load float, float addrspace(1)* undef, align 8 %x1 = insertelement <4 x float> undef, float %x0, i32 0 Index: test/CodeGen/X86/avx512-cvt.ll =================================================================== --- test/CodeGen/X86/avx512-cvt.ll +++ test/CodeGen/X86/avx512-cvt.ll @@ -600,8 +600,7 @@ define void @fpext() { ; ALL-LABEL: fpext: ; ALL: ## BB#0: ## %entry -; ALL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 +; 
ALL-NEXT: vcvtss2sd -{{[0-9]+}}(%rsp), %xmm0, %xmm0 ; ALL-NEXT: vmovsd %xmm0, -{{[0-9]+}}(%rsp) ; ALL-NEXT: retq entry: Index: test/CodeGen/X86/fold-load-unops.ll =================================================================== --- test/CodeGen/X86/fold-load-unops.ll +++ test/CodeGen/X86/fold-load-unops.ll @@ -13,8 +13,7 @@ ; ; AVX-LABEL: rcpss: ; AVX: # BB#0: -; AVX-NEXT: vmovss (%rdi), %xmm0 -; AVX-NEXT: vrcpss %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vrcpss (%rdi), %xmm0, %xmm0 ; AVX-NEXT: retq %ld = load float, float* %a %ins = insertelement <4 x float> undef, float %ld, i32 0 @@ -32,8 +31,7 @@ ; ; AVX-LABEL: rsqrtss: ; AVX: # BB#0: -; AVX-NEXT: vmovss (%rdi), %xmm0 -; AVX-NEXT: vrsqrtss %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vrsqrtss (%rdi), %xmm0, %xmm0 ; AVX-NEXT: retq %ld = load float, float* %a %ins = insertelement <4 x float> undef, float %ld, i32 0 @@ -51,8 +49,7 @@ ; ; AVX-LABEL: sqrtss: ; AVX: # BB#0: -; AVX-NEXT: vmovss (%rdi), %xmm0 -; AVX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vsqrtss (%rdi), %xmm0, %xmm0 ; AVX-NEXT: retq %ld = load float, float* %a %ins = insertelement <4 x float> undef, float %ld, i32 0 @@ -70,8 +67,7 @@ ; ; AVX-LABEL: sqrtsd: ; AVX: # BB#0: -; AVX-NEXT: vmovsd (%rdi), %xmm0 -; AVX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vsqrtsd (%rdi), %xmm0, %xmm0 ; AVX-NEXT: retq %ld = load double, double* %a %ins = insertelement <2 x double> undef, double %ld, i32 0