Index: lib/Target/X86/X86.td
===================================================================
--- lib/Target/X86/X86.td
+++ lib/Target/X86/X86.td
@@ -188,10 +188,14 @@
                                        "LEA instruction with certain arguments is slow">;
 def FeatureSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true",
                                        "INC and DEC instructions are slower than ADD and SUB">;
-def FeatureUseSqrtEst : SubtargetFeature<"use-sqrt-est", "UseSqrtEst", "true",
-                                       "Use RSQRT* to optimize square root calculations">;
-def FeatureUseRecipEst : SubtargetFeature<"use-recip-est", "UseReciprocalEst",
-                                       "true", "Use RCP* to optimize division calculations">;
+def FeatureRecip : SubtargetFeature<"recip-div", "UseRecip", "true",
+                                       "Allow scalar reciprocal estimate code generation">;
+def FeatureVecRecip : SubtargetFeature<"recip-vec-div", "UseVecRecip", "true",
+                                       "Allow vector reciprocal estimate code generation">;
+def FeatureRsqrt : SubtargetFeature<"recip-sqrt", "UseRsqrt", "true",
+                                       "Allow scalar reciprocal square root code generation">;
+def FeatureVecRsqrt : SubtargetFeature<"recip-vec-sqrt", "UseVecRsqrt", "true",
+                                       "Allow vector reciprocal square root code generation">;
 
 //===----------------------------------------------------------------------===//
 // X86 processors supported.
@@ -441,7 +445,8 @@
                       FeaturePRFCHW, FeatureAES, FeaturePCLMUL,
                       FeatureBMI, FeatureF16C, FeatureMOVBE,
                       FeatureLZCNT, FeaturePOPCNT, FeatureFastUAMem,
-                      FeatureSlowSHLD, FeatureUseSqrtEst, FeatureUseRecipEst]>;
+                      FeatureSlowSHLD, FeatureRecip, FeatureVecRecip,
+                      FeatureRsqrt, FeatureVecRsqrt]>;
 
 // TODO: We should probably add 'FeatureFastUAMem' to all of the AMD chips.
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -12763,13 +12763,6 @@
                                             DAGCombinerInfo &DCI,
                                             unsigned &RefinementSteps,
                                             bool &UseOneConstNR) const {
-  // FIXME: We should use instruction latency models to calculate the cost of
-  // each potential sequence, but this is very hard to do reliably because
-  // at least Intel's Core* chips have variable timing based on the number of
-  // significant digits in the divisor and/or sqrt operand.
-  if (!Subtarget->useSqrtEst())
-    return SDValue();
-
   EVT VT = Op.getValueType();
 
   // SSE1 has rsqrtss and rsqrtps.
@@ -12779,8 +12772,14 @@
   // instructions: convert to single, rsqrtss, convert back to double, refine
   // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
   // along with FMA, this could be a throughput win.
-  if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
-      (Subtarget->hasAVX() && VT == MVT::v8f32)) {
+  bool UseRSQRTSS =
+      (Subtarget->hasSSE1() && Subtarget->useRsqrtEst() && VT == MVT::f32);
+  bool UseRSQRTPS =
+      (Subtarget->hasSSE1() && Subtarget->useVecRsqrtEst() && VT == MVT::v4f32);
+  bool UseVRSQRTPS =
+      (Subtarget->hasAVX() && Subtarget->useVecRsqrtEst() && VT == MVT::v8f32);
+
+  if (UseRSQRTSS || UseRSQRTPS || UseVRSQRTPS) {
     RefinementSteps = 1;
     UseOneConstNR = false;
     return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
@@ -12793,13 +12792,6 @@
 SDValue X86TargetLowering::getRecipEstimate(SDValue Op,
                                             DAGCombinerInfo &DCI,
                                             unsigned &RefinementSteps) const {
-  // FIXME: We should use instruction latency models to calculate the cost of
-  // each potential sequence, but this is very hard to do reliably because
-  // at least Intel's Core* chips have variable timing based on the number of
-  // significant digits in the divisor.
-  if (!Subtarget->useReciprocalEst())
-    return SDValue();
-
   EVT VT = Op.getValueType();
 
   // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
@@ -12809,8 +12801,14 @@
   // 15 instructions: convert to single, rcpss, convert back to double, refine
   // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
   // along with FMA, this could be a throughput win.
-  if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
-      (Subtarget->hasAVX() && VT == MVT::v8f32)) {
+  bool UseRCPSS =
+      (Subtarget->hasSSE1() && Subtarget->useRecipEst() && VT == MVT::f32);
+  bool UseRCPPS =
+      (Subtarget->hasSSE1() && Subtarget->useVecRecipEst() && VT == MVT::v4f32);
+  bool UseVRCPPS =
+      (Subtarget->hasAVX() && Subtarget->useVecRecipEst() && VT == MVT::v8f32);
+
+  if (UseRCPSS || UseRCPPS || UseVRCPPS) {
     RefinementSteps = ReciprocalEstimateRefinementSteps;
     return DCI.DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
   }
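Note on what the deleted guards imply: with the early useSqrtEst()/useReciprocalEst() bailouts gone, the profitability decision lives entirely in the four new per-type subtarget flags, and once an FRCP or FRSQRT node is emitted here, the target-independent DAGCombiner expands it into the Newton-Raphson sequences checked in the tests below. For the reciprocal case, the iteration being generated is, as a sketch of the standard derivation (not text from the patch):

\[
e_{n+1} = e_n + e_n\,(1 - d\,e_n) = e_n\,(2 - d\,e_n),
\qquad
e_n = \frac{1 - \varepsilon}{d}
\;\Rightarrow\;
e_{n+1} = \frac{(1 - \varepsilon)(1 + \varepsilon)}{d} = \frac{1 - \varepsilon^{2}}{d}.
\]

The relative error squares on every step. The Intel SDM bounds the rcpss/rcpps estimate at |eps| <= 1.5 * 2^-12, so a single step reaches roughly 2^-23 relative error, which is why one refinement step is the default for f32 here.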
Index: lib/Target/X86/X86Subtarget.h
===================================================================
--- lib/Target/X86/X86Subtarget.h
+++ lib/Target/X86/X86Subtarget.h
@@ -190,16 +190,13 @@
   /// True if INC and DEC instructions are slow when writing to flags
   bool SlowIncDec;
 
-  /// Use the RSQRT* instructions to optimize square root calculations.
-  /// For this to be profitable, the cost of FSQRT and FDIV must be
-  /// substantially higher than normal FP ops like FADD and FMUL.
-  bool UseSqrtEst;
-
-  /// Use the RCP* instructions to optimize FP division calculations.
-  /// For this to be profitable, the cost of FDIV must be
-  /// substantially higher than normal FP ops like FADD and FMUL.
-  bool UseReciprocalEst;
-
+  /// True if estimate instructions (rsqrtss, rsqrtps, rcpss, rcpps) may be
+  /// used in place of full-precision floating point math.
+  bool UseRsqrt;
+  bool UseVecRsqrt;
+  bool UseRecip;
+  bool UseVecRecip;
+
   /// Processor has AVX-512 PreFetch Instructions
   bool HasPFI;
 
@@ -374,8 +371,10 @@
   bool LEAusesAG() const { return LEAUsesAG; }
   bool slowLEA() const { return SlowLEA; }
   bool slowIncDec() const { return SlowIncDec; }
-  bool useSqrtEst() const { return UseSqrtEst; }
-  bool useReciprocalEst() const { return UseReciprocalEst; }
+  bool useRsqrtEst() const { return UseRsqrt; }
+  bool useVecRsqrtEst() const { return UseVecRsqrt; }
+  bool useRecipEst() const { return UseRecip; }
+  bool useVecRecipEst() const { return UseVecRecip; }
   bool hasCDI() const { return HasCDI; }
   bool hasPFI() const { return HasPFI; }
   bool hasERI() const { return HasERI; }
Index: lib/Target/X86/X86Subtarget.cpp
===================================================================
--- lib/Target/X86/X86Subtarget.cpp
+++ lib/Target/X86/X86Subtarget.cpp
@@ -273,8 +273,10 @@
   LEAUsesAG = false;
   SlowLEA = false;
   SlowIncDec = false;
-  UseSqrtEst = false;
-  UseReciprocalEst = false;
+  UseRsqrt = false;
+  UseVecRsqrt = false;
+  UseRecip = false;
+  UseVecRecip = false;
   stackAlignment = 4;
   // FIXME: this is a known good value for Yonah. How about others?
   MaxInlineSizeThreshold = 128;
Index: test/CodeGen/X86/recip-fastmath.ll
===================================================================
--- test/CodeGen/X86/recip-fastmath.ll
+++ test/CodeGen/X86/recip-fastmath.ll
@@ -1,109 +1,120 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx,use-recip-est | FileCheck %s --check-prefix=RECIP
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx,use-recip-est -x86-recip-refinement-steps=2 | FileCheck %s --check-prefix=REFINE
-
-; If the target's divss/divps instructions are substantially
-; slower than rcpss/rcpps with a Newton-Raphson refinement,
-; we should generate the estimate sequence.
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx,recip-div | FileCheck %s --check-prefix=RECIP
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx,recip-vec-div | FileCheck %s --check-prefix=VECRECIP
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx,recip-div -x86-recip-refinement-steps=2 | FileCheck %s --check-prefix=REFINE
 
 ; See PR21385 ( http://llvm.org/bugs/show_bug.cgi?id=21385 )
 ; for details about the accuracy, speed, and implementation
 ; differences of x86 reciprocal estimates.
 
 define float @reciprocal_estimate(float %x) #0 {
-  %div = fdiv fast float 1.0, %x
-  ret float %div
-
 ; CHECK-LABEL: reciprocal_estimate:
-; CHECK: movss
-; CHECK-NEXT: divss
-; CHECK-NEXT: movaps
-; CHECK-NEXT: retq
-
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    divss %xmm0, %xmm1
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    retq
+;
 ; RECIP-LABEL: reciprocal_estimate:
-; RECIP: vrcpss
-; RECIP: vmulss
-; RECIP: vsubss
-; RECIP: vmulss
-; RECIP: vaddss
-; RECIP-NEXT: retq
-
+; RECIP:       # BB#0:
+; RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
+; RECIP-NEXT:    vmulss %xmm1, %xmm0, %xmm0
+; RECIP-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; RECIP-NEXT:    vsubss %xmm0, %xmm2, %xmm0
+; RECIP-NEXT:    vmulss %xmm0, %xmm1, %xmm0
+; RECIP-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; RECIP-NEXT:    retq
+;
+; VECRECIP-LABEL: reciprocal_estimate:
+; VECRECIP:       # BB#0:
+; VECRECIP-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; VECRECIP-NEXT:    vdivss %xmm0, %xmm1, %xmm0
+; VECRECIP-NEXT:    retq
+;
 ; REFINE-LABEL: reciprocal_estimate:
-; REFINE: vrcpss
-; REFINE: vmulss
-; REFINE: vsubss
-; REFINE: vmulss
-; REFINE: vaddss
-; REFINE: vmulss
-; REFINE: vsubss
-; REFINE: vmulss
-; REFINE: vaddss
-; REFINE-NEXT: retq
+; REFINE:       # BB#0:
+; REFINE-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
+; REFINE-NEXT:    vmulss %xmm1, %xmm0, %xmm2
+; REFINE-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; REFINE-NEXT:    vsubss %xmm2, %xmm3, %xmm2
+; REFINE-NEXT:    vmulss %xmm2, %xmm1, %xmm2
+; REFINE-NEXT:    vaddss %xmm2, %xmm1, %xmm1
+; REFINE-NEXT:    vmulss %xmm1, %xmm0, %xmm0
+; REFINE-NEXT:    vsubss %xmm0, %xmm3, %xmm0
+; REFINE-NEXT:    vmulss %xmm0, %xmm1, %xmm0
+; REFINE-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; REFINE-NEXT:    retq
+  %div = fdiv fast float 1.0, %x
+  ret float %div
 }
 
 define <4 x float> @reciprocal_estimate_v4f32(<4 x float> %x) #0 {
-  %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
-  ret <4 x float> %div
-
 ; CHECK-LABEL: reciprocal_estimate_v4f32:
-; CHECK: movaps
-; CHECK-NEXT: divps
-; CHECK-NEXT: movaps
-; CHECK-NEXT: retq
-
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; CHECK-NEXT:    divps %xmm0, %xmm1
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    retq
+;
 ; RECIP-LABEL: reciprocal_estimate_v4f32:
-; RECIP: vrcpps
-; RECIP: vmulps
-; RECIP: vsubps
-; RECIP: vmulps
-; RECIP: vaddps
-; RECIP-NEXT: retq
-
+; RECIP:       # BB#0:
+; RECIP-NEXT:    vmovaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; RECIP-NEXT:    vdivps %xmm0, %xmm1, %xmm0
+; RECIP-NEXT:    retq
+;
+; VECRECIP-LABEL: reciprocal_estimate_v4f32:
+; VECRECIP:       # BB#0:
+; VECRECIP-NEXT:    vrcpps %xmm0, %xmm1
+; VECRECIP-NEXT:    vmulps %xmm1, %xmm0, %xmm0
+; VECRECIP-NEXT:    vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; VECRECIP-NEXT:    vsubps %xmm0, %xmm2, %xmm0
+; VECRECIP-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; VECRECIP-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; VECRECIP-NEXT:    retq
+;
 ; REFINE-LABEL: reciprocal_estimate_v4f32:
-; REFINE: vrcpps
-; REFINE: vmulps
-; REFINE: vsubps
-; REFINE: vmulps
-; REFINE: vaddps
-; REFINE: vmulps
-; REFINE: vsubps
-; REFINE: vmulps
-; REFINE: vaddps
-; REFINE-NEXT: retq
+; REFINE:       # BB#0:
+; REFINE-NEXT:    vmovaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; REFINE-NEXT:    vdivps %xmm0, %xmm1, %xmm0
+; REFINE-NEXT:    retq
+  %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
+  ret <4 x float> %div
 }
 
 define <8 x float> @reciprocal_estimate_v8f32(<8 x float> %x) #0 {
-  %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
-  ret <8 x float> %div
-
 ; CHECK-LABEL: reciprocal_estimate_v8f32:
-; CHECK: movaps
-; CHECK: movaps
-; CHECK-NEXT: divps
-; CHECK-NEXT: divps
-; CHECK-NEXT: movaps
-; CHECK-NEXT: movaps
-; CHECK-NEXT: retq
-
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; CHECK-NEXT:    movaps %xmm2, %xmm3
+; CHECK-NEXT:    divps %xmm0, %xmm3
+; CHECK-NEXT:    divps %xmm1, %xmm2
+; CHECK-NEXT:    movaps %xmm3, %xmm0
+; CHECK-NEXT:    movaps %xmm2, %xmm1
+; CHECK-NEXT:    retq
+;
 ; RECIP-LABEL: reciprocal_estimate_v8f32:
-; RECIP: vrcpps
-; RECIP: vmulps
-; RECIP: vsubps
-; RECIP: vmulps
-; RECIP: vaddps
-; RECIP-NEXT: retq
-
+; RECIP:       # BB#0:
+; RECIP-NEXT:    vmovaps {{.*#+}} ymm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; RECIP-NEXT:    vdivps %ymm0, %ymm1, %ymm0
+; RECIP-NEXT:    retq
+;
+; VECRECIP-LABEL: reciprocal_estimate_v8f32:
+; VECRECIP:       # BB#0:
+; VECRECIP-NEXT:    vrcpps %ymm0, %ymm1
+; VECRECIP-NEXT:    vmulps %ymm1, %ymm0, %ymm0
+; VECRECIP-NEXT:    vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; VECRECIP-NEXT:    vsubps %ymm0, %ymm2, %ymm0
+; VECRECIP-NEXT:    vmulps %ymm0, %ymm1, %ymm0
+; VECRECIP-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; VECRECIP-NEXT:    retq
+;
 ; REFINE-LABEL: reciprocal_estimate_v8f32:
-; REFINE: vrcpps
-; REFINE: vmulps
-; REFINE: vsubps
-; REFINE: vmulps
-; REFINE: vaddps
-; REFINE: vmulps
-; REFINE: vsubps
-; REFINE: vmulps
-; REFINE: vaddps
-; REFINE-NEXT: retq
+; REFINE:       # BB#0:
+; REFINE-NEXT:    vmovaps {{.*#+}} ymm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; REFINE-NEXT:    vdivps %ymm0, %ymm1, %ymm0
+; REFINE-NEXT:    retq
+  %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
+  ret <8 x float> %div
 }
 
 attributes #0 = { "unsafe-fp-math"="true" }
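The RECIP and REFINE runs above are one and two iterations of that same Newton-Raphson step. A minimal standalone model in C++ (illustration only: recip_refined is a hypothetical helper, not code from this patch, and it assumes an SSE-capable host):

#include <cstdio>
#include <xmmintrin.h> // _mm_rcp_ss, _mm_set_ss, _mm_cvtss_f32 (SSE)

// Model of the expanded FRCP sequence: take the hardware estimate of 1/d
// (~12 correct bits) and apply N Newton-Raphson steps, corresponding to
// -x86-recip-refinement-steps=N.
static float recip_refined(float d, int steps) {
  float e = _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ss(d)));
  for (int i = 0; i < steps; ++i)
    e = e + e * (1.0f - d * e); // the mul/sub/mul/add of each iteration
  return e;
}

int main() {
  // steps=1 mirrors the RECIP checks; steps=2 mirrors REFINE.
  for (int s = 0; s <= 2; ++s)
    printf("steps=%d: 1/3 ~ %.9f (divss result %.9f)\n", s,
           recip_refined(3.0f, s), 1.0f / 3.0f);
  return 0;
}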
Index: test/CodeGen/X86/sqrt-fastmath.ll
===================================================================
--- test/CodeGen/X86/sqrt-fastmath.ll
+++ test/CodeGen/X86/sqrt-fastmath.ll
@@ -1,5 +1,6 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx,use-sqrt-est | FileCheck %s --check-prefix=ESTIMATE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx,recip-sqrt | FileCheck %s --check-prefix=ESTIMATE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx,recip-vec-sqrt | FileCheck %s --check-prefix=VECEST
 
 declare double @__sqrt_finite(double) #0
 declare float @__sqrtf_finite(float) #0
@@ -19,6 +20,11 @@
 ; ESTIMATE:       # BB#0:
 ; ESTIMATE-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
 ; ESTIMATE-NEXT:    retq
+;
+; VECEST-LABEL: fd:
+; VECEST:       # BB#0:
+; VECEST-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
+; VECEST-NEXT:    retq
   %call = tail call double @__sqrt_finite(double %d) #1
   ret double %call
 }
@@ -43,6 +49,11 @@
 ; ESTIMATE-NEXT:    vcmpeqss %xmm2, %xmm0, %xmm0
 ; ESTIMATE-NEXT:    vandnps %xmm1, %xmm0, %xmm0
 ; ESTIMATE-NEXT:    retq
+;
+; VECEST-LABEL: ff:
+; VECEST:       # BB#0:
+; VECEST-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
+; VECEST-NEXT:    retq
   %call = tail call float @__sqrtf_finite(float %f) #1
   ret float %call
 }
@@ -60,6 +71,12 @@
 ; ESTIMATE-NEXT:    fldt {{[0-9]+}}(%rsp)
 ; ESTIMATE-NEXT:    fsqrt
 ; ESTIMATE-NEXT:    retq
+;
+; VECEST-LABEL: fld:
+; VECEST:       # BB#0:
+; VECEST-NEXT:    fldt {{[0-9]+}}(%rsp)
+; VECEST-NEXT:    fsqrt
+; VECEST-NEXT:    retq
   %call = tail call x86_fp80 @__sqrtl_finite(x86_fp80 %ld) #1
   ret x86_fp80 %call
 }
@@ -83,6 +100,13 @@
 ; ESTIMATE-NEXT:    vaddss {{.*}}(%rip), %xmm0, %xmm0
 ; ESTIMATE-NEXT:    vmulss %xmm2, %xmm0, %xmm0
 ; ESTIMATE-NEXT:    retq
+;
+; VECEST-LABEL: reciprocal_square_root:
+; VECEST:       # BB#0:
+; VECEST-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
+; VECEST-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; VECEST-NEXT:    vdivss %xmm0, %xmm1, %xmm0
+; VECEST-NEXT:    retq
   %sqrt = tail call float @llvm.sqrt.f32(float %x)
   %div = fdiv fast float 1.0, %sqrt
   ret float %div
@@ -98,13 +122,20 @@
 ;
 ; ESTIMATE-LABEL: reciprocal_square_root_v4f32:
 ; ESTIMATE:       # BB#0:
-; ESTIMATE-NEXT:    vrsqrtps %xmm0, %xmm1
-; ESTIMATE-NEXT:    vmulps %xmm1, %xmm1, %xmm2
-; ESTIMATE-NEXT:    vmulps %xmm0, %xmm2, %xmm0
-; ESTIMATE-NEXT:    vaddps {{.*}}(%rip), %xmm0, %xmm0
-; ESTIMATE-NEXT:    vmulps {{.*}}(%rip), %xmm1, %xmm1
-; ESTIMATE-NEXT:    vmulps %xmm1, %xmm0, %xmm0
+; ESTIMATE-NEXT:    vsqrtps %xmm0, %xmm0
+; ESTIMATE-NEXT:    vmovaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; ESTIMATE-NEXT:    vdivps %xmm0, %xmm1, %xmm0
 ; ESTIMATE-NEXT:    retq
+;
+; VECEST-LABEL: reciprocal_square_root_v4f32:
+; VECEST:       # BB#0:
+; VECEST-NEXT:    vrsqrtps %xmm0, %xmm1
+; VECEST-NEXT:    vmulps %xmm1, %xmm1, %xmm2
+; VECEST-NEXT:    vmulps %xmm0, %xmm2, %xmm0
+; VECEST-NEXT:    vaddps {{.*}}(%rip), %xmm0, %xmm0
+; VECEST-NEXT:    vmulps {{.*}}(%rip), %xmm1, %xmm1
+; VECEST-NEXT:    vmulps %xmm1, %xmm0, %xmm0
+; VECEST-NEXT:    retq
   %sqrt = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x)
   %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
   ret <4 x float> %div
@@ -123,13 +154,20 @@
 ;
 ; ESTIMATE-LABEL: reciprocal_square_root_v8f32:
 ; ESTIMATE:       # BB#0:
-; ESTIMATE-NEXT:    vrsqrtps %ymm0, %ymm1
-; ESTIMATE-NEXT:    vmulps %ymm1, %ymm1, %ymm2
-; ESTIMATE-NEXT:    vmulps %ymm0, %ymm2, %ymm0
-; ESTIMATE-NEXT:    vaddps {{.*}}(%rip), %ymm0, %ymm0
-; ESTIMATE-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1
-; ESTIMATE-NEXT:    vmulps %ymm1, %ymm0, %ymm0
+; ESTIMATE-NEXT:    vsqrtps %ymm0, %ymm0
+; ESTIMATE-NEXT:    vmovaps {{.*#+}} ymm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; ESTIMATE-NEXT:    vdivps %ymm0, %ymm1, %ymm0
 ; ESTIMATE-NEXT:    retq
+;
+; VECEST-LABEL: reciprocal_square_root_v8f32:
+; VECEST:       # BB#0:
+; VECEST-NEXT:    vrsqrtps %ymm0, %ymm1
+; VECEST-NEXT:    vmulps %ymm1, %ymm1, %ymm2
+; VECEST-NEXT:    vmulps %ymm0, %ymm2, %ymm0
+; VECEST-NEXT:    vaddps {{.*}}(%rip), %ymm0, %ymm0
+; VECEST-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1
+; VECEST-NEXT:    vmulps %ymm1, %ymm0, %ymm0
+; VECEST-NEXT:    retq
   %sqrt = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %x)
   %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
   ret <8 x float> %div
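For the square root tests, the VECEST sequences for v4f32/v8f32 are the one-step rsqrt refinement; in LLVM's two-constant expansion (UseOneConstNR = false above) the two memory operands in the vaddps/vmulps checks are the constants -3.0 and -0.5. A scalar model under the same caveats (rsqrt_refined is illustrative only, not code from this patch):

#include <cmath>
#include <cstdio>
#include <xmmintrin.h> // _mm_rsqrt_ss, _mm_set_ss, _mm_cvtss_f32 (SSE)

// Model of the expanded FRSQRT sequence: refine the hardware estimate with
// e' = (x*e*e - 3) * (e * -0.5), which is algebraically the textbook
// Newton-Raphson step e * 0.5 * (3 - x*e*e) for 1/sqrt(x).
static float rsqrt_refined(float x) {
  float e = _mm_cvtss_f32(_mm_rsqrt_ss(_mm_set_ss(x)));
  return (x * e * e - 3.0f) * (e * -0.5f);
}

int main() {
  printf("rsqrt(2) ~ %.8f (exact %.8f)\n",
         rsqrt_refined(2.0f), 1.0f / sqrtf(2.0f));
  return 0;
}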