Index: lib/Transforms/Utils/SimplifyLibCalls.cpp
===================================================================
--- lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -1061,6 +1061,50 @@
   return Ret;
 }
 
+/// Build \p pow1 raised to the constant integer power \p Exp out of fmuls.
+///
+/// Every power is the product of two smaller powers, following a fixed
+/// addition chain per exponent (up to 32).
+/// Refer: http://wwwhomes.uni-bielefeld.de/achim/addition_chain.html
+///
+/// Only the powers that \p Exp depends on are emitted, in increasing order of
+/// exponent. Returns 1.0 when \p Exp is 0 and nullptr when it exceeds 32.
+static Value *getPow(Value *pow1, unsigned Exp, IRBuilder<> &B) {
+  const unsigned MaxExp = 32;
+  // AddChain[N] = {A, B} means x^N is emitted as x^A * x^B (A + B == N).
+  // Rows 0 and 1 are placeholders: x^0 is a constant and x^1 is pow1 itself.
+  static const unsigned char AddChain[MaxExp + 1][2] = {
+      {0, 0},   {0, 0},   {1, 1},   {1, 2},   {2, 2},   {2, 3},   {3, 3},
+      {2, 5},   {4, 4},   {1, 8},   {5, 5},   {1, 10},  {6, 6},   {4, 9},
+      {7, 7},   {3, 12},  {8, 8},   {8, 9},   {2, 16},  {1, 18},  {10, 10},
+      {6, 15},  {11, 11}, {3, 20},  {12, 12}, {8, 17},  {13, 13}, {3, 24},
+      {14, 14}, {4, 25},  {15, 15}, {3, 28},  {16, 16}};
+
+  if (Exp == 0) // x^0 is 1.0; nothing to multiply.
+    return ConstantFP::get(pow1->getType(), 1.0);
+  if (Exp > MaxExp) // No addition chain is tabulated for this exponent.
+    return nullptr;
+
+  // Mark every intermediate power that Exp depends on. A power is built only
+  // from smaller powers, so a single descending pass reaches all of them.
+  bool Needed[MaxExp + 1] = {false};
+  Needed[Exp] = true;
+  for (unsigned N = Exp; N > 1; --N) {
+    if (!Needed[N])
+      continue;
+    Needed[AddChain[N][0]] = true;
+    Needed[AddChain[N][1]] = true;
+  }
+
+  // Emit the marked powers smallest first so operands precede their users.
+  Value *Pow[MaxExp + 1] = {nullptr};
+  Pow[1] = pow1;
+  for (unsigned N = 2; N <= Exp; ++N)
+    if (Needed[N])
+      Pow[N] = B.CreateFMul(Pow[AddChain[N][0]], Pow[AddChain[N][1]]);
+  return Pow[Exp];
+}
+
 Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) {
   Function *Callee = CI->getCalledFunction();
   Value *Ret = nullptr;
@@ -1154,6 +1198,27 @@
     return B.CreateFMul(Op1, Op1, "pow2");
   if (Op2C->isExactlyValue(-1.0)) // pow(x, -1.0) -> 1.0/x
     return B.CreateFDiv(ConstantFP::get(CI->getType(), 1.0), Op1, "powrecip");
+
+  // Only in fast-math mode, generate repeated fmul instead of generating
+  // pow(x, n).
+  if (canUseUnsafeFPMath(CI->getParent()->getParent())) {
+    APFloat V = abs(Op2C->getValueAPF());
+    // We limit to a max of 7 fmul(s). Thus max exponent is 32.
+    // This transformation applies to integer exponents only.
+    if (V.compare(APFloat(V.getSemantics(), 32.0)) ==
+            APFloat::cmpGreaterThan || !V.isInteger())
+      return nullptr;
+    // We cannot readily convert a non-double type (like float) to a double.
+    // So we first convert V to something which could be converted to double.
+    bool ignored;
+    V.convert(APFloat::IEEEdouble, APFloat::rmTowardZero, &ignored);
+    Value *FMul = getPow(Op1, V.convertToDouble(), B);
+    // For negative exponents simply compute the reciprocal.
+    if (Op2C->isNegative())
+      FMul = B.CreateFDiv(ConstantFP::get(CI->getType(), 1.0), FMul);
+    return FMul;
+  }
+
   return nullptr;
 }
 
Index: test/Transforms/InstCombine/pow-4.ll
===================================================================
--- /dev/null
+++ test/Transforms/InstCombine/pow-4.ll
@@ -0,0 +1,120 @@
+; Test that the pow library call simplifier works correctly.
+
+; RUN: opt -instcombine -S < %s | FileCheck %s --check-prefix=CHECK
+
+; Declarations of the pow intrinsics exercised below.
+declare double @llvm.pow.f64(double, double)
+declare float @llvm.pow.f32(float, float)
+
+; pow(x, 4.0f) -> (x*x) * (x*x), float variant
+define float @test_simplify_4f(float %x) #0 {
+; CHECK-LABEL: @test_simplify_4f(
+; CHECK-NOT: pow
+; CHECK-NEXT: %1 = fmul float %x, %x
+; CHECK-NEXT: %2 = fmul float %1, %1
+; CHECK-NEXT: ret float %2
+  %1 = call float @llvm.pow.f32(float %x, float 4.000000e+00)
+  ret float %1
+}
+
+; pow(x, 3.0) -> (x*x) * x
+define double @test_simplify_3(double %x) #0 {
+; CHECK-LABEL: @test_simplify_3(
+; CHECK-NOT: pow
+; CHECK-NEXT: %1 = fmul double %x, %x
+; CHECK-NEXT: %2 = fmul double %1, %x
+; CHECK-NEXT: ret double %2
+  %1 = call double @llvm.pow.f64(double %x, double 3.000000e+00)
+  ret double %1
+}
+
+; pow(x, 4.0) -> (x*x) * (x*x)
+define double @test_simplify_4(double %x) #0 {
+; CHECK-LABEL: @test_simplify_4(
+; CHECK-NOT: pow
+; CHECK-NEXT: %1 = fmul double %x, %x
+; CHECK-NEXT: %2 = fmul double %1, %1
+; CHECK-NEXT: ret double %2
+  %1 = call double @llvm.pow.f64(double %x, double 4.000000e+00)
+  ret double %1
+}
+
+; pow(x, 15.0) -> x^3 * x^12, built from x^2, x^3, x^6 and x^12
+define double @test_simplify_15(double %x) #0 {
+; CHECK-LABEL: @test_simplify_15(
+; CHECK-NOT: pow
+; CHECK-NEXT: %1 = fmul double %x, %x
+; CHECK-NEXT: %2 = fmul double %1, %x
+; CHECK-NEXT: %3 = fmul double %2, %2
+; CHECK-NEXT: %4 = fmul double %3, %3
+; CHECK-NEXT: %5 = fmul double %2, %4
+; CHECK-NEXT: ret double %5
+  %1 = call double @llvm.pow.f64(double %x, double 1.500000e+01)
+  ret double %1
+}
+
+; pow(x, -7.0) -> 1.0 / (x^2 * x^5): a negative exponent takes the reciprocal
+define double @test_simplify_neg_7(double %x) #0 {
+; CHECK-LABEL: @test_simplify_neg_7(
+; CHECK-NOT: pow
+; CHECK-NEXT: %1 = fmul double %x, %x
+; CHECK-NEXT: %2 = fmul double %1, %x
+; CHECK-NEXT: %3 = fmul double %1, %2
+; CHECK-NEXT: %4 = fmul double %1, %3
+; CHECK-NEXT: %5 = fdiv double 1.000000e+00, %4
+; CHECK-NEXT: ret double %5
+  %1 = call double @llvm.pow.f64(double %x, double -7.000000e+00)
+  ret double %1
+}
+
+; pow(x, -19.0) -> 1.0 / (x^18 * x), with x^18 = x^2 * x^16
+define double @test_simplify_neg_19(double %x) #0 {
+; CHECK-LABEL: @test_simplify_neg_19(
+; CHECK-NOT: pow
+; CHECK-NEXT: %1 = fmul double %x, %x
+; CHECK-NEXT: %2 = fmul double %1, %1
+; CHECK-NEXT: %3 = fmul double %2, %2
+; CHECK-NEXT: %4 = fmul double %3, %3
+; CHECK-NEXT: %5 = fmul double %1, %4
+; CHECK-NEXT: %6 = fmul double %5, %x
+; CHECK-NEXT: %7 = fdiv double 1.000000e+00, %6
+; CHECK-NEXT: ret double %7
+  %1 = call double @llvm.pow.f64(double %x, double -1.900000e+01)
+  ret double %1
+}
+
+; pow(x, 11.23): non-integer exponent, so the call must be left untouched
+define double @test_simplify_11_23(double %x) #0 {
+; CHECK-LABEL: @test_simplify_11_23(
+; CHECK-NOT: fmul
+; CHECK-NEXT: %1 = call double @llvm.pow.f64(double %x, double 1.123000e+01)
+; CHECK-NEXT: ret double %1
+  %1 = call double @llvm.pow.f64(double %x, double 1.123000e+01)
+  ret double %1
+}
+
+; pow(x, 32.0) -> five successive squarings; 32 is the largest exponent expanded
+define double @test_simplify_32(double %x) #0 {
+; CHECK-LABEL: @test_simplify_32(
+; CHECK-NOT: pow
+; CHECK-NEXT: %1 = fmul double %x, %x
+; CHECK-NEXT: %2 = fmul double %1, %1
+; CHECK-NEXT: %3 = fmul double %2, %2
+; CHECK-NEXT: %4 = fmul double %3, %3
+; CHECK-NEXT: %5 = fmul double %4, %4
+; CHECK-NEXT: ret double %5
+  %1 = call double @llvm.pow.f64(double %x, double 3.200000e+01)
+  ret double %1
+}
+
+; pow(x, 33.0): exponent above 32, so the call must be left untouched
+define double @test_simplify_33(double %x) #0 {
+; CHECK-LABEL: @test_simplify_33(
+; CHECK-NOT: fmul
+; CHECK-NEXT: %1 = call double @llvm.pow.f64(double %x, double 3.300000e+01)
+; CHECK-NEXT: ret double %1
+  %1 = call double @llvm.pow.f64(double %x, double 3.300000e+01)
+  ret double %1
+}
+
+attributes #0 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="true" "use-soft-float"="false" }