Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -350,6 +350,7 @@
     SDValue visitFREM(SDNode *N);
     SDValue visitFSQRT(SDNode *N);
     SDValue visitFCOPYSIGN(SDNode *N);
+    SDValue visitFPOW(SDNode *N);
     SDValue visitSINT_TO_FP(SDNode *N);
     SDValue visitUINT_TO_FP(SDNode *N);
     SDValue visitFP_TO_SINT(SDNode *N);
@@ -1568,6 +1569,7 @@
   case ISD::FREM:               return visitFREM(N);
   case ISD::FSQRT:              return visitFSQRT(N);
   case ISD::FCOPYSIGN:          return visitFCOPYSIGN(N);
+  case ISD::FPOW:               return visitFPOW(N);
   case ISD::SINT_TO_FP:         return visitSINT_TO_FP(N);
   case ISD::UINT_TO_FP:         return visitUINT_TO_FP(N);
   case ISD::FP_TO_SINT:         return visitFP_TO_SINT(N);
@@ -11566,6 +11568,45 @@
   return SDValue();
 }
 
+SDValue DAGCombiner::visitFPOW(SDNode *N) {
+  ConstantFPSDNode *ExponentC = isConstOrConstSplatFP(N->getOperand(1));
+  if (!ExponentC)
+    return SDValue();
+
+  // Try to convert x ** (1/4) into square roots.
+  // x ** (1/2) is canonicalized to sqrt, so we do not bother with that case.
+  // TODO: This could be extended (using a target hook) to handle smaller
+  // power-of-2 fractional exponents.
+  if (ExponentC->getValueAPF().isExactlyValue(0.25)) {
+    // pow(-0.0, 0.25) = +0.0; sqrt(sqrt(-0.0)) = -0.0.
+    // pow(-inf, 0.25) = +inf; sqrt(sqrt(-inf)) = NaN.
+    // For regular numbers, rounding may cause the results to differ.
+    // Therefore, we require { nsz ninf afn } for this transform.
+    // TODO: We could select out the special cases if we don't have nsz/ninf.
+    SDNodeFlags Flags = N->getFlags();
+    if (!Flags.hasNoSignedZeros() || !Flags.hasNoInfs() ||
+        !Flags.hasApproximateFuncs())
+      return SDValue();
+
+    // Don't double the number of libcalls. We are trying to inline fast code.
+    EVT VT = N->getValueType(0);
+    if (!DAG.getTargetLoweringInfo().isOperationLegalOrCustom(ISD::FSQRT, VT))
+      return SDValue();
+
+    // Assume that libcalls are the smallest code.
+    // TODO: This restriction should probably be lifted for vectors.
+    if (DAG.getMachineFunction().getFunction().optForSize())
+      return SDValue();
+
+    // pow(X, 0.25) --> sqrt(sqrt(X))
+    SDLoc DL(N);
+    SDValue Sqrt = DAG.getNode(ISD::FSQRT, DL, VT, N->getOperand(0), Flags);
+    return DAG.getNode(ISD::FSQRT, DL, VT, Sqrt, Flags);
+  }
+
+  return SDValue();
+}
+
 static SDValue foldFPToIntToFP(SDNode *N, SelectionDAG &DAG,
                                const TargetLowering &TLI) {
   // This optimization is guarded by a function attribute because it may produce
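
For reference, the IEEE-754 special-case values cited in the comments above can be verified with a small standalone C++ check (illustrative only, not part of the patch):

#include <cmath>
#include <cstdio>

int main() {
  double nzero = -0.0;
  double ninf = -INFINITY;
  // Per the C standard's Annex F: pow(+/-0, y) = +0 and pow(-inf, y) = +inf
  // for y > 0 when y is not an odd integer, while sqrt preserves the sign
  // of -0 and returns NaN for -inf. So sqrt(sqrt(x)) diverges from
  // pow(x, 0.25) exactly on these inputs, which is why nsz and ninf are
  // required for the fold.
  std::printf("pow(-0.0, 0.25)  = %g\n", std::pow(nzero, 0.25));       // +0
  std::printf("sqrt(sqrt(-0.0)) = %g\n", std::sqrt(std::sqrt(nzero))); // -0
  std::printf("pow(-inf, 0.25)  = %g\n", std::pow(ninf, 0.25));        // inf
  std::printf("sqrt(sqrt(-inf)) = %g\n", std::sqrt(std::sqrt(ninf)));  // nan
  return 0;
}

The afn flag then licenses the remaining rounding differences on regular numbers.
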
Index: test/CodeGen/AArch64/pow.ll
===================================================================
--- test/CodeGen/AArch64/pow.ll
+++ test/CodeGen/AArch64/pow.ll
@@ -10,8 +10,9 @@
 define float @pow_f32_one_fourth_fmf(float %x) nounwind {
 ; CHECK-LABEL: pow_f32_one_fourth_fmf:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: fmov s1, #0.25000000
-; CHECK-NEXT: b powf
+; CHECK-NEXT: fsqrt s0, s0
+; CHECK-NEXT: fsqrt s0, s0
+; CHECK-NEXT: ret
   %r = call nsz ninf afn float @llvm.pow.f32(float %x, float 2.5e-01)
   ret float %r
 }
@@ -19,8 +20,9 @@
 define double @pow_f64_one_fourth_fmf(double %x) nounwind {
 ; CHECK-LABEL: pow_f64_one_fourth_fmf:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: fmov d1, #0.25000000
-; CHECK-NEXT: b pow
+; CHECK-NEXT: fsqrt d0, d0
+; CHECK-NEXT: fsqrt d0, d0
+; CHECK-NEXT: ret
   %r = call nsz ninf afn double @llvm.pow.f64(double %x, double 2.5e-01)
   ret double %r
 }
@@ -28,42 +30,8 @@
 define <4 x float> @pow_v4f32_one_fourth_fmf(<4 x float> %x) nounwind {
 ; CHECK-LABEL: pow_v4f32_one_fourth_fmf:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #48 // =48
-; CHECK-NEXT: str d8, [sp, #32] // 8-byte Folded Spill
-; CHECK-NEXT: fmov s8, #0.25000000
-; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT: mov s0, v0.s[1]
-; CHECK-NEXT: mov v1.16b, v8.16b
-; CHECK-NEXT: str x30, [sp, #40] // 8-byte Folded Spill
-; CHECK-NEXT: bl powf
-; CHECK-NEXT: str d0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: mov v1.16b, v8.16b
-; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0
-; CHECK-NEXT: bl powf
-; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload
-; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
-; CHECK-NEXT: mov v0.s[1], v1.s[0]
-; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: mov v1.16b, v8.16b
-; CHECK-NEXT: mov s0, v0.s[2]
-; CHECK-NEXT: bl powf
-; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload
-; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
-; CHECK-NEXT: mov v1.s[2], v0.s[0]
-; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: str q1, [sp] // 16-byte Folded Spill
-; CHECK-NEXT: mov v1.16b, v8.16b
-; CHECK-NEXT: mov s0, v0.s[3]
-; CHECK-NEXT: bl powf
-; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x30, [sp, #40] // 8-byte Folded Reload
-; CHECK-NEXT: ldr d8, [sp, #32] // 8-byte Folded Reload
-; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
-; CHECK-NEXT: mov v1.s[3], v0.s[0]
-; CHECK-NEXT: mov v0.16b, v1.16b
-; CHECK-NEXT: add sp, sp, #48 // =48
+; CHECK-NEXT: fsqrt v0.4s, v0.4s
+; CHECK-NEXT: fsqrt v0.4s, v0.4s
 ; CHECK-NEXT: ret
   %r = call fast <4 x float> @llvm.pow.v4f32(<4 x float> %x, <4 x float> <float 2.5e-01, float 2.5e-01, float 2.5e-01, float 2.5e-01>)
   ret <4 x float> %r
@@ -72,25 +40,8 @@
 define <2 x double> @pow_v2f64_one_fourth_fmf(<2 x double> %x) nounwind {
 ; CHECK-LABEL: pow_v2f64_one_fourth_fmf:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #48 // =48
-; CHECK-NEXT: str d8, [sp, #32] // 8-byte Folded Spill
-; CHECK-NEXT: fmov d8, #0.25000000
-; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT: mov d0, v0.d[1]
-; CHECK-NEXT: mov v1.16b, v8.16b
-; CHECK-NEXT: str x30, [sp, #40] // 8-byte Folded Spill
-; CHECK-NEXT: bl pow
-; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT: mov v1.16b, v8.16b
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-NEXT: bl pow
-; CHECK-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x30, [sp, #40] // 8-byte Folded Reload
-; CHECK-NEXT: ldr d8, [sp, #32] // 8-byte Folded Reload
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-NEXT: add sp, sp, #48 // =48
+; CHECK-NEXT: fsqrt v0.2d, v0.2d
+; CHECK-NEXT: fsqrt v0.2d, v0.2d
 ; CHECK-NEXT: ret
   %r = call fast <2 x double> @llvm.pow.v2f64(<2 x double> %x, <2 x double> <double 2.5e-01, double 2.5e-01>)
   ret <2 x double> %r
Index: test/CodeGen/ARM/pow.ll
===================================================================
--- test/CodeGen/ARM/pow.ll
+++ test/CodeGen/ARM/pow.ll
@@ -0,0 +1,92 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=thumbv7m-linux-gnueabi | FileCheck %s --check-prefixes=ANY,SOFTFLOAT
+; RUN: llc < %s -mtriple=thumbv8-linux-gnueabihf -mattr=neon | FileCheck %s --check-prefixes=ANY,HARDFLOAT
+
+declare float @llvm.pow.f32(float, float)
+declare <4 x float> @llvm.pow.v4f32(<4 x float>, <4 x float>)
+
+declare double @llvm.pow.f64(double, double)
+declare <2 x double> @llvm.pow.v2f64(<2 x double>, <2 x double>)
+
+define float @pow_f32_one_fourth_fmf(float %x) nounwind {
+; ANY-LABEL: pow_f32_one_fourth_fmf:
+; SOFTFLOAT: bl powf
+; HARDFLOAT: vsqrt.f32
+; HARDFLOAT: vsqrt.f32
+  %r = call nsz ninf afn float @llvm.pow.f32(float %x, float 2.5e-01)
+  ret float %r
+}
+
+define double @pow_f64_one_fourth_fmf(double %x) nounwind {
+; ANY-LABEL: pow_f64_one_fourth_fmf:
+; SOFTFLOAT: bl pow
+; HARDFLOAT: vsqrt.f64
+; HARDFLOAT: vsqrt.f64
+  %r = call nsz ninf afn double @llvm.pow.f64(double %x, double 2.5e-01)
+  ret double %r
+}
+
+define <4 x float> @pow_v4f32_one_fourth_fmf(<4 x float> %x) nounwind {
+; ANY-LABEL: pow_v4f32_one_fourth_fmf:
+; SOFTFLOAT: bl powf
+; SOFTFLOAT: bl powf
+; SOFTFLOAT: bl powf
+; SOFTFLOAT: bl powf
+; HARDFLOAT: vsqrt.f32
+; HARDFLOAT: vsqrt.f32
+; HARDFLOAT: vsqrt.f32
+; HARDFLOAT: vsqrt.f32
+; HARDFLOAT: vsqrt.f32
+; HARDFLOAT: vsqrt.f32
+; HARDFLOAT: vsqrt.f32
+; HARDFLOAT: vsqrt.f32
+  %r = call fast <4 x float> @llvm.pow.v4f32(<4 x float> %x, <4 x float> <float 2.5e-01, float 2.5e-01, float 2.5e-01, float 2.5e-01>)
+  ret <4 x float> %r
+}
+
+define <2 x double> @pow_v2f64_one_fourth_fmf(<2 x double> %x) nounwind {
+; ANY-LABEL: pow_v2f64_one_fourth_fmf:
+; SOFTFLOAT: bl pow
+; SOFTFLOAT: bl pow
+; HARDFLOAT: vsqrt.f64
+; HARDFLOAT: vsqrt.f64
+; HARDFLOAT: vsqrt.f64
+; HARDFLOAT: vsqrt.f64
+  %r = call fast <2 x double> @llvm.pow.v2f64(<2 x double> %x, <2 x double> <double 2.5e-01, double 2.5e-01>)
+  ret <2 x double> %r
+}
+
+define float @pow_f32_one_fourth_not_enough_fmf(float %x) nounwind {
+; ANY-LABEL: pow_f32_one_fourth_not_enough_fmf:
+; SOFTFLOAT: bl powf
+; HARDFLOAT: b powf
+  %r = call afn ninf float @llvm.pow.f32(float %x, float 2.5e-01)
+  ret float %r
+}
+
+define double @pow_f64_one_fourth_not_enough_fmf(double %x) nounwind {
+; ANY-LABEL: pow_f64_one_fourth_not_enough_fmf:
+; SOFTFLOAT: bl pow
+; HARDFLOAT: b pow
+  %r = call nsz ninf double @llvm.pow.f64(double %x, double 2.5e-01)
+  ret double %r
+}
+
+define <4 x float> @pow_v4f32_one_fourth_not_enough_fmf(<4 x float> %x) nounwind {
+; ANY-LABEL: pow_v4f32_one_fourth_not_enough_fmf:
+; ANY: bl powf
+; ANY: bl powf
+; ANY: bl powf
+; ANY: bl powf
+  %r = call afn nsz <4 x float> @llvm.pow.v4f32(<4 x float> %x, <4 x float> <float 2.5e-01, float 2.5e-01, float 2.5e-01, float 2.5e-01>)
+  ret <4 x float> %r
+}
+
+define <2 x double> @pow_v2f64_one_fourth_not_enough_fmf(<2 x double> %x) nounwind {
+; ANY-LABEL: pow_v2f64_one_fourth_not_enough_fmf:
+; ANY: bl pow
+; ANY: bl pow
+  %r = call nsz nnan reassoc <2 x double> @llvm.pow.v2f64(<2 x double> %x, <2 x double> <double 2.5e-01, double 2.5e-01>)
+  ret <2 x double> %r
+}
+
Index: test/CodeGen/X86/pow.ll
===================================================================
--- test/CodeGen/X86/pow.ll
+++ test/CodeGen/X86/pow.ll
@@ -10,8 +10,30 @@
 define float @pow_f32_one_fourth_fmf(float %x) nounwind {
 ; CHECK-LABEL: pow_f32_one_fourth_fmf:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; CHECK-NEXT: jmp powf # TAILCALL
+; CHECK-NEXT: rsqrtss %xmm0, %xmm1
+; CHECK-NEXT: movaps %xmm0, %xmm2
+; CHECK-NEXT: mulss %xmm1, %xmm2
+; CHECK-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; CHECK-NEXT: movaps %xmm2, %xmm4
+; CHECK-NEXT: mulss %xmm3, %xmm4
+; CHECK-NEXT: mulss %xmm1, %xmm2
+; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: addss %xmm1, %xmm2
+; CHECK-NEXT: mulss %xmm4, %xmm2
+; CHECK-NEXT: xorps %xmm4, %xmm4
+; CHECK-NEXT: cmpeqss %xmm4, %xmm0
+; CHECK-NEXT: andnps %xmm2, %xmm0
+; CHECK-NEXT: xorps %xmm2, %xmm2
+; CHECK-NEXT: rsqrtss %xmm0, %xmm2
+; CHECK-NEXT: movaps %xmm0, %xmm5
+; CHECK-NEXT: mulss %xmm2, %xmm5
+; CHECK-NEXT: mulss %xmm5, %xmm3
+; CHECK-NEXT: mulss %xmm2, %xmm5
+; CHECK-NEXT: addss %xmm1, %xmm5
+; CHECK-NEXT: mulss %xmm3, %xmm5
+; CHECK-NEXT: cmpeqss %xmm4, %xmm0
+; CHECK-NEXT: andnps %xmm5, %xmm0
+; CHECK-NEXT: retq
   %r = call nsz ninf afn float @llvm.pow.f32(float %x, float 2.5e-01)
   ret float %r
 }
@@ -19,8 +41,9 @@
 define double @pow_f64_one_fourth_fmf(double %x) nounwind {
 ; CHECK-LABEL: pow_f64_one_fourth_fmf:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
-; CHECK-NEXT: jmp pow # TAILCALL
+; CHECK-NEXT: sqrtsd %xmm0, %xmm0
+; CHECK-NEXT: sqrtsd %xmm0, %xmm0
+; CHECK-NEXT: retq
   %r = call nsz ninf afn double @llvm.pow.f64(double %x, double 2.5e-01)
   ret double %r
 }
@@ -28,33 +51,28 @@
 define <4 x float> @pow_v4f32_one_fourth_fmf(<4 x float> %x) nounwind {
 ; CHECK-LABEL: pow_v4f32_one_fourth_fmf:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: subq $56, %rsp
-; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; CHECK-NEXT: callq powf
-; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
-; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; CHECK-NEXT: callq powf
-; CHECK-NEXT: unpcklps (%rsp), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; CHECK-NEXT: callq powf
-; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; CHECK-NEXT: callq powf
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; CHECK-NEXT: unpcklpd (%rsp), %xmm1 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm1 = xmm1[0],mem[0]
-; CHECK-NEXT: movaps %xmm1, %xmm0
-; CHECK-NEXT: addq $56, %rsp
+; CHECK-NEXT: rsqrtps %xmm0, %xmm1
+; CHECK-NEXT: movaps %xmm0, %xmm2
+; CHECK-NEXT: mulps %xmm1, %xmm2
+; CHECK-NEXT: movaps {{.*#+}} xmm3 = [-5.000000e-01,-5.000000e-01,-5.000000e-01,-5.000000e-01]
+; CHECK-NEXT: movaps %xmm2, %xmm4
+; CHECK-NEXT: mulps %xmm3, %xmm4
+; CHECK-NEXT: mulps %xmm1, %xmm2
+; CHECK-NEXT: movaps {{.*#+}} xmm1 = [-3.000000e+00,-3.000000e+00,-3.000000e+00,-3.000000e+00]
+; CHECK-NEXT: addps %xmm1, %xmm2
+; CHECK-NEXT: mulps %xmm4, %xmm2
+; CHECK-NEXT: xorps %xmm4, %xmm4
+; CHECK-NEXT: cmpneqps %xmm4, %xmm0
+; CHECK-NEXT: andps %xmm2, %xmm0
+; CHECK-NEXT: rsqrtps %xmm0, %xmm2
+; CHECK-NEXT: movaps %xmm0, %xmm5
+; CHECK-NEXT: mulps %xmm2, %xmm5
+; CHECK-NEXT: mulps %xmm5, %xmm3
+; CHECK-NEXT: mulps %xmm2, %xmm5
+; CHECK-NEXT: addps %xmm1, %xmm5
+; CHECK-NEXT: mulps %xmm3, %xmm5
+; CHECK-NEXT: cmpneqps %xmm4, %xmm0
+; CHECK-NEXT: andps %xmm5, %xmm0
 ; CHECK-NEXT: retq
   %r = call fast <4 x float> @llvm.pow.v4f32(<4 x float> %x, <4 x float> <float 2.5e-01, float 2.5e-01, float 2.5e-01, float 2.5e-01>)
   ret <4 x float> %r
@@ -63,19 +81,8 @@
 define <2 x double> @pow_v2f64_one_fourth_fmf(<2 x double> %x) nounwind {
 ; CHECK-LABEL: pow_v2f64_one_fourth_fmf:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: subq $40, %rsp
-; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
-; CHECK-NEXT: callq pow
-; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
-; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
-; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
-; CHECK-NEXT: callq pow
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; CHECK-NEXT: movaps %xmm1, %xmm0
-; CHECK-NEXT: addq $40, %rsp
+; CHECK-NEXT: sqrtpd %xmm0, %xmm0
+; CHECK-NEXT: sqrtpd %xmm0, %xmm0
 ; CHECK-NEXT: retq
   %r = call fast <2 x double> @llvm.pow.v2f64(<2 x double> %x, <2 x double> <double 2.5e-01, double 2.5e-01>)
   ret <2 x double> %r
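
A note on the x86 f32 output above: the checks show rsqrtss/rsqrtps rather than sqrtss/sqrtps because, under these fast-math flags, x86 expands each f32 sqrt with the hardware reciprocal-square-root estimate plus one Newton-Raphson refinement step (the -0.5 and -3.0 splat constants visible in the v4f32 test), guarded so that an input of zero yields zero instead of 0 * inf = NaN. A rough scalar sketch of what that sequence computes (sqrt_via_rsqrt is a hypothetical helper for illustration, not code from the patch):

#include <cstdio>
#include <immintrin.h>

// Mirrors the scalar f32 sequence in the checks above: hardware rsqrt
// estimate, one Newton-Raphson step, then a zero guard.
static float sqrt_via_rsqrt(float a) {
  float r = _mm_cvtss_f32(_mm_rsqrt_ss(_mm_set_ss(a))); // rsqrtss: r ~= 1/sqrt(a)
  float ar = a * r;                                     // a * r ~= sqrt(a)
  float s = (ar * -0.5f) * (ar * r - 3.0f);             // = 0.5 * a * r * (3 - a*r*r)
  return a == 0.0f ? 0.0f : s; // rsqrt(0) = inf would otherwise give 0 * inf = NaN
}

int main() {
  // Two chained approximate sqrts, as in the transformed pow(x, 0.25):
  std::printf("%f\n", sqrt_via_rsqrt(sqrt_via_rsqrt(16.0f))); // ~2.0 = 16^0.25
  return 0;
}

For f64 there is no rsqrt estimate on x86, so the checks use plain sqrtsd/sqrtpd.
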