Index: lib/Target/X86/X86InstrAVX512.td =================================================================== --- lib/Target/X86/X86InstrAVX512.td +++ lib/Target/X86/X86InstrAVX512.td @@ -6154,12 +6154,7 @@ def : Pat<(f64 (extloadf32 addr:$src)), (COPY_TO_REGCLASS (VCVTSS2SDZrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>, - Requires<[HasAVX512, OptForSize]>; - -def : Pat<(f64 (extloadf32 addr:$src)), - (COPY_TO_REGCLASS (VCVTSS2SDZrr (v4f32 (IMPLICIT_DEF)), - (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)), VR128X)>, - Requires<[HasAVX512, OptForSpeed]>; + Requires<[HasAVX512]>; def : Pat<(f32 (fpround FR64X:$src)), (COPY_TO_REGCLASS (VCVTSD2SSZrr (COPY_TO_REGCLASS FR64X:$src, VR128X), @@ -7188,7 +7183,7 @@ def : Pat<(_.EltVT (OpNode (load addr:$src))), (!cast(NAME#SUFF#Zm) - (_.EltVT (IMPLICIT_DEF)), addr:$src)>, Requires<[HasAVX512, OptForSize]>; + (_.EltVT (IMPLICIT_DEF)), addr:$src)>, Requires<[HasAVX512]>; } multiclass avx512_sqrt_scalar_all opc, string OpcodeStr> { @@ -7207,15 +7202,14 @@ def : Pat<(f32 (X86frsqrt FR32X:$src)), (COPY_TO_REGCLASS (VRSQRT14SSrr (v4f32 (IMPLICIT_DEF)), (COPY_TO_REGCLASS FR32X:$src, VR128X)), VR128X)>; def : Pat<(f32 (X86frsqrt (load addr:$src))), - (COPY_TO_REGCLASS (VRSQRT14SSrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>, - Requires<[OptForSize]>; + (COPY_TO_REGCLASS (VRSQRT14SSrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>; def : Pat<(f32 (X86frcp FR32X:$src)), (COPY_TO_REGCLASS (VRCP14SSrr (v4f32 (IMPLICIT_DEF)), (COPY_TO_REGCLASS FR32X:$src, VR128X)), VR128X )>; def : Pat<(f32 (X86frcp (load addr:$src))), - (COPY_TO_REGCLASS (VRCP14SSrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>, - Requires<[OptForSize]>; + (COPY_TO_REGCLASS (VRCP14SSrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>; } + multiclass avx512_rndscale_scalar opc, string OpcodeStr, X86VectorVTInfo _> { Index: lib/Target/X86/X86InstrInfo.cpp =================================================================== --- lib/Target/X86/X86InstrInfo.cpp +++ lib/Target/X86/X86InstrInfo.cpp @@ -1806,6 +1806,7 @@ { X86::VCMPSDZrr_Int, X86::VCMPSDZrm_Int, TB_NO_REVERSE }, { X86::VCMPSSZrr, X86::VCMPSSZrm, 0 }, { X86::VCMPSSZrr_Int, X86::VCMPSSZrm_Int, TB_NO_REVERSE }, + { X86::VCVTSS2SDZrr, X86::VCVTSS2SDZrm, TB_NO_REVERSE }, { X86::VDIVPDZrr, X86::VDIVPDZrm, 0 }, { X86::VDIVPSZrr, X86::VDIVPSZrm, 0 }, { X86::VDIVSDZrr, X86::VDIVSDZrm, 0 }, @@ -1923,8 +1924,12 @@ { X86::VPUNPCKLWDZrr, X86::VPUNPCKLWDZrm, 0 }, { X86::VPXORDZrr, X86::VPXORDZrm, 0 }, { X86::VPXORQZrr, X86::VPXORQZrm, 0 }, + { X86::VRCP14SSrr, X86::VRCP14SSrm, TB_NO_REVERSE }, + { X86::VRSQRT14SSrr, X86::VRSQRT14SSrm, TB_NO_REVERSE }, { X86::VSHUFPDZrri, X86::VSHUFPDZrmi, 0 }, { X86::VSHUFPSZrri, X86::VSHUFPSZrmi, 0 }, + { X86::VSQRTSSZr, X86::VSQRTSSZm, 0 }, + { X86::VSQRTSDZr, X86::VSQRTSDZm, 0 }, { X86::VSUBPDZrr, X86::VSUBPDZrm, 0 }, { X86::VSUBPSZrr, X86::VSUBPSZrm, 0 }, { X86::VSUBSDZrr, X86::VSUBSDZrm, 0 }, Index: lib/Target/X86/X86InstrSSE.td =================================================================== --- lib/Target/X86/X86InstrSSE.td +++ lib/Target/X86/X86InstrSSE.td @@ -1786,7 +1786,7 @@ (ins FR64:$src1, f64mem:$src2), "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [], IIC_SSE_CVT_Scalar_RM>, - XD, Requires<[HasAVX, OptForSize]>, VEX_4V, VEX_LIG, + XD, Requires<[HasAVX]>, VEX_4V, VEX_LIG, Sched<[WriteCvtF2FLd, ReadAfterLd]>; } @@ -1852,7 +1852,7 @@ (ins FR32:$src1, f32mem:$src2), "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", [], IIC_SSE_CVT_Scalar_RM>, - XS, VEX_4V, VEX_LIG, Requires<[HasAVX, OptForSize]>, + XS, VEX_4V, VEX_LIG, Requires<[HasAVX]>, Sched<[WriteCvtF2FLd, ReadAfterLd]>; } @@ -1863,10 +1863,7 @@ def : Pat<(extloadf32 addr:$src), (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>, - Requires<[UseAVX, OptForSize]>; -def : Pat<(extloadf32 addr:$src), - (VCVTSS2SDrr (f32 (IMPLICIT_DEF)), (VMOVSSrm addr:$src))>, - Requires<[UseAVX, OptForSpeed]>; + Requires<[UseAVX]>; def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src), "cvtss2sd\t{$src, $dst|$dst, $src}", @@ -3457,15 +3454,6 @@ } } - // We don't want to fold scalar loads into these instructions unless - // optimizing for size. This is because the folded instruction will have a - // partial register update, while the unfolded sequence will not, e.g. - // vmovss mem, %xmm0 - // vrcpss %xmm0, %xmm0, %xmm0 - // which has a clobber before the rcp, vs. - // vrcpss mem, %xmm0, %xmm0 - // TODO: In theory, we could fold the load, and avoid the stall caused by - // the partial register store, either in ExeDepFix or with smarter RA. let Predicates = [UseAVX] in { def : Pat<(OpNode RC:$src), (!cast("V"#NAME#Suffix##r) (ScalarVT (IMPLICIT_DEF)), RC:$src)>; @@ -3475,12 +3463,13 @@ (!cast("V"#NAME#Suffix##r_Int) VR128:$src, VR128:$src)>; } - let Predicates = [HasAVX, OptForSize] in { + + let Predicates = [HasAVX] in { def : Pat<(Intr (scalar_to_vector (ScalarVT (load addr:$src2)))), (!cast("V"#NAME#Suffix##m_Int) (vt (IMPLICIT_DEF)), addr:$src2)>; } - let Predicates = [UseAVX, OptForSize] in { + let Predicates = [UseAVX] in { def : Pat<(ScalarVT (OpNode (load addr:$src))), (!cast("V"#NAME#Suffix##m) (ScalarVT (IMPLICIT_DEF)), addr:$src)>; Index: test/CodeGen/X86/avx-arith.ll =================================================================== --- test/CodeGen/X86/avx-arith.ll +++ test/CodeGen/X86/avx-arith.ll @@ -350,8 +350,7 @@ define <4 x float> @int_sqrt_ss() { ; CHECK-LABEL: int_sqrt_ss: ; CHECK: ## BB#0: -; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vsqrtss (%rax), %xmm0, %xmm0 ; CHECK-NEXT: retq %x0 = load float, float addrspace(1)* undef, align 8 %x1 = insertelement <4 x float> undef, float %x0, i32 0 Index: test/CodeGen/X86/avx512-cvt.ll =================================================================== --- test/CodeGen/X86/avx512-cvt.ll +++ test/CodeGen/X86/avx512-cvt.ll @@ -600,8 +600,7 @@ define void @fpext() { ; ALL-LABEL: fpext: ; ALL: ## BB#0: ## %entry -; ALL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 +; ALL-NEXT: vcvtss2sd -{{[0-9]+}}(%rsp), %xmm0, %xmm0 ; ALL-NEXT: vmovsd %xmm0, -{{[0-9]+}}(%rsp) ; ALL-NEXT: retq entry: Index: test/CodeGen/X86/fold-load-unops.ll =================================================================== --- test/CodeGen/X86/fold-load-unops.ll +++ test/CodeGen/X86/fold-load-unops.ll @@ -13,8 +13,7 @@ ; ; AVX-LABEL: rcpss: ; AVX: # BB#0: -; AVX-NEXT: vmovss (%rdi), %xmm0 -; AVX-NEXT: vrcpss %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vrcpss (%rdi), %xmm0, %xmm0 ; AVX-NEXT: retq %ld = load float, float* %a %ins = insertelement <4 x float> undef, float %ld, i32 0 @@ -32,8 +31,7 @@ ; ; AVX-LABEL: rsqrtss: ; AVX: # BB#0: -; AVX-NEXT: vmovss (%rdi), %xmm0 -; AVX-NEXT: vrsqrtss %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vrsqrtss (%rdi), %xmm0, %xmm0 ; AVX-NEXT: retq %ld = load float, float* %a %ins = insertelement <4 x float> undef, float %ld, i32 0 @@ -51,8 +49,7 @@ ; ; AVX-LABEL: sqrtss: ; AVX: # BB#0: -; AVX-NEXT: vmovss (%rdi), %xmm0 -; AVX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vsqrtss (%rdi), %xmm0, %xmm0 ; AVX-NEXT: retq %ld = load float, float* %a %ins = insertelement <4 x float> undef, float %ld, i32 0 @@ -70,8 +67,7 @@ ; ; AVX-LABEL: sqrtsd: ; AVX: # BB#0: -; AVX-NEXT: vmovsd (%rdi), %xmm0 -; AVX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vsqrtsd (%rdi), %xmm0, %xmm0 ; AVX-NEXT: retq %ld = load double, double* %a %ins = insertelement <2 x double> undef, double %ld, i32 0 Index: test/CodeGen/X86/stack-folding-fp-avx512.ll =================================================================== --- test/CodeGen/X86/stack-folding-fp-avx512.ll +++ test/CodeGen/X86/stack-folding-fp-avx512.ll @@ -756,5 +756,51 @@ ret <16 x float> %4 } +define float @stack_fold_vsqrtss(float %a0) { + ;CHECK-LABEL: stack_fold_vsqrtss + ;CHECK: vsqrtss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call float @llvm.sqrt.f32(float %a0) + ret float %2 +} +declare float @llvm.sqrt.f32(float %Val) + +define double @stack_fold_vsqrtsd(double %a0) { + ;CHECK-LABEL: stack_fold_vsqrtsd + ;CHECK: vsqrtsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call double @llvm.sqrt.f64(double %a0) + ret double %2 +} +declare double @llvm.sqrt.f64(double %Val) + +define <4 x float> @stack_fold_vrsqrt14ss(<4 x float> %a0, <4 x float> %a1) { + ;CHECK-LABEL: stack_fold_vrsqrt14ss + ;CHECK: vrsqrt14ss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %res = call <4 x float> @llvm.x86.avx512.rsqrt14.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1) ; + ret <4 x float> %res +} +declare <4 x float> @llvm.x86.avx512.rsqrt14.ss(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone + +define <4 x float> @stack_fold_vrcp14ss(<4 x float> %a0, <4 x float> %a1) { + ; CHECK-LABEL: stack_fold_vrcp14ss + ; CHECK: vrcp14ss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %res = call <4 x float> @llvm.x86.avx512.rcp14.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1) ; + ret <4 x float> %res +} +declare <4 x float> @llvm.x86.avx512.rcp14.ss(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone + +define <2 x double> @stack_fold_vcvtss2sd(<2 x double> %a0, <4 x float> %a1) { +; CHECK-LABEL: stack_fold_vcvtss2sd +; CHECK: vcvtss2sd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %ext = extractelement <4 x float> %a1, i32 0 + %cvt = fpext float %ext to double + %res = insertelement <2 x double> %a0, double %cvt, i32 0 + ret <2 x double> %res +} + attributes #0 = { "unsafe-fp-math"="false" } attributes #1 = { "unsafe-fp-math"="true" }