Skip to content

Commit 8213072

Browse files
author
Weiming Zhao
committedDec 4, 2015
[SimplifyLibCalls] Optimization for pow(x, n) where n is some constant
Summary: In order to avoid calling pow function we generate repeated fmul when n is a positive or negative whole number. For each exponent we pre-compute Addition Chains in order to minimize the no. of fmuls. Refer: http://wwwhomes.uni-bielefeld.de/achim/addition_chain.html We pre-compute addition chains for exponents upto 32 (which results in a max of 7 fmuls). For eg: 4 = 2+2 5 = 2+3 6 = 3+3 and so on Hence, pow(x, 4.0) ==> y = fmul x, x x = fmul y, y ret x For negative exponents, we simply compute the reciprocal of the final result. Note: This transformation is only enabled under fast-math. Patch by Mandeep Singh Grang <mgrang@codeaurora.org> Reviewers: weimingz, majnemer, escha, davide, scanon, joerg Subscribers: probinson, escha, llvm-commits Differential Revision: http://reviews.llvm.org/D13994 llvm-svn: 254776
1 parent b51aafd commit 8213072

File tree

2 files changed

+171
-0
lines changed

2 files changed

+171
-0
lines changed
 

‎llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp

+51
Original file line numberDiff line numberDiff line change
@@ -1058,6 +1058,31 @@ Value *LibCallSimplifier::optimizeCos(CallInst *CI, IRBuilder<> &B) {
10581058
return Ret;
10591059
}
10601060

1061+
static Value *getPow(Value *InnerChain[33], unsigned Exp, IRBuilder<> &B) {
1062+
// Multiplications calculated using Addition Chains.
1063+
// Refer: http://wwwhomes.uni-bielefeld.de/achim/addition_chain.html
1064+
1065+
assert(Exp != 0 && "Incorrect exponent 0 not handled");
1066+
1067+
if (InnerChain[Exp])
1068+
return InnerChain[Exp];
1069+
1070+
static const unsigned AddChain[33][2] = {
1071+
{0, 0}, // Unused.
1072+
{0, 0}, // Unused (base case = pow1).
1073+
{1, 1}, // Unused (pre-computed).
1074+
{1, 2}, {2, 2}, {2, 3}, {3, 3}, {2, 5}, {4, 4},
1075+
{1, 8}, {5, 5}, {1, 10}, {6, 6}, {4, 9}, {7, 7},
1076+
{3, 12}, {8, 8}, {8, 9}, {2, 16}, {1, 18}, {10, 10},
1077+
{6, 15}, {11, 11}, {3, 20}, {12, 12}, {8, 17}, {13, 13},
1078+
{3, 24}, {14, 14}, {4, 25}, {15, 15}, {3, 28}, {16, 16},
1079+
};
1080+
1081+
InnerChain[Exp] = B.CreateFMul(getPow(InnerChain, AddChain[Exp][0], B),
1082+
getPow(InnerChain, AddChain[Exp][1], B));
1083+
return InnerChain[Exp];
1084+
}
1085+
10611086
Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) {
10621087
Function *Callee = CI->getCalledFunction();
10631088
Value *Ret = nullptr;
@@ -1156,6 +1181,32 @@ Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) {
11561181
return B.CreateFMul(Op1, Op1, "pow2");
11571182
if (Op2C->isExactlyValue(-1.0)) // pow(x, -1.0) -> 1.0/x
11581183
return B.CreateFDiv(ConstantFP::get(CI->getType(), 1.0), Op1, "powrecip");
1184+
1185+
// In -ffast-math, generate repeated fmul instead of generating pow(x, n).
1186+
if (unsafeFPMath) {
1187+
APFloat V = abs(Op2C->getValueAPF());
1188+
// We limit to a max of 7 fmul(s). Thus max exponent is 32.
1189+
// This transformation applies to integer exponents only.
1190+
if (V.compare(APFloat(V.getSemantics(), 32.0)) == APFloat::cmpGreaterThan ||
1191+
!V.isInteger())
1192+
return nullptr;
1193+
1194+
// We will memoize intermediate products of the Addition Chain.
1195+
Value *InnerChain[33] = {nullptr};
1196+
InnerChain[1] = Op1;
1197+
InnerChain[2] = B.CreateFMul(Op1, Op1);
1198+
1199+
// We cannot readily convert a non-double type (like float) to a double.
1200+
// So we first convert V to something which could be converted to double.
1201+
bool ignored;
1202+
V.convert(APFloat::IEEEdouble, APFloat::rmTowardZero, &ignored);
1203+
Value *FMul = getPow(InnerChain, V.convertToDouble(), B);
1204+
// For negative exponents simply compute the reciprocal.
1205+
if (Op2C->isNegative())
1206+
FMul = B.CreateFDiv(ConstantFP::get(CI->getType(), 1.0), FMul);
1207+
return FMul;
1208+
}
1209+
11591210
return nullptr;
11601211
}
11611212

+120
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
; Test that the pow library call simplifier works correctly.
2+
3+
; RUN: opt -instcombine -S < %s | FileCheck %s
4+
5+
; Function Attrs: nounwind readnone
6+
declare double @llvm.pow.f64(double, double)
7+
declare float @llvm.pow.f32(float, float)
8+
9+
; pow(x, 4.0f)
10+
define float @test_simplify_4f(float %x) #0 {
11+
; CHECK-LABEL: @test_simplify_4f(
12+
; CHECK-NOT: pow
13+
; CHECK-NEXT: %1 = fmul float %x, %x
14+
; CHECK-NEXT: %2 = fmul float %1, %1
15+
; CHECK-NEXT: ret float %2
16+
%1 = call float @llvm.pow.f32(float %x, float 4.000000e+00)
17+
ret float %1
18+
}
19+
20+
; pow(x, 3.0)
21+
define double @test_simplify_3(double %x) #0 {
22+
; CHECK-LABEL: @test_simplify_3(
23+
; CHECK-NOT: pow
24+
; CHECK-NEXT: %1 = fmul double %x, %x
25+
; CHECK-NEXT: %2 = fmul double %1, %x
26+
; CHECK-NEXT: ret double %2
27+
%1 = call double @llvm.pow.f64(double %x, double 3.000000e+00)
28+
ret double %1
29+
}
30+
31+
; pow(x, 4.0)
32+
define double @test_simplify_4(double %x) #0 {
33+
; CHECK-LABEL: @test_simplify_4(
34+
; CHECK-NOT: pow
35+
; CHECK-NEXT: %1 = fmul double %x, %x
36+
; CHECK-NEXT: %2 = fmul double %1, %1
37+
; CHECK-NEXT: ret double %2
38+
%1 = call double @llvm.pow.f64(double %x, double 4.000000e+00)
39+
ret double %1
40+
}
41+
42+
; pow(x, 15.0)
43+
define double @test_simplify_15(double %x) #0 {
44+
; CHECK-LABEL: @test_simplify_15(
45+
; CHECK-NOT: pow
46+
; CHECK-NEXT: %1 = fmul double %x, %x
47+
; CHECK-NEXT: %2 = fmul double %1, %x
48+
; CHECK-NEXT: %3 = fmul double %2, %2
49+
; CHECK-NEXT: %4 = fmul double %3, %3
50+
; CHECK-NEXT: %5 = fmul double %2, %4
51+
; CHECK-NEXT: ret double %5
52+
%1 = call double @llvm.pow.f64(double %x, double 1.500000e+01)
53+
ret double %1
54+
}
55+
56+
; pow(x, -7.0)
57+
define double @test_simplify_neg_7(double %x) #0 {
58+
; CHECK-LABEL: @test_simplify_neg_7(
59+
; CHECK-NOT: pow
60+
; CHECK-NEXT: %1 = fmul double %x, %x
61+
; CHECK-NEXT: %2 = fmul double %1, %x
62+
; CHECK-NEXT: %3 = fmul double %1, %2
63+
; CHECK-NEXT: %4 = fmul double %1, %3
64+
; CHECK-NEXT: %5 = fdiv double 1.000000e+00, %4
65+
; CHECK-NEXT: ret double %5
66+
%1 = call double @llvm.pow.f64(double %x, double -7.000000e+00)
67+
ret double %1
68+
}
69+
70+
; pow(x, -19.0)
71+
define double @test_simplify_neg_19(double %x) #0 {
72+
; CHECK-LABEL: @test_simplify_neg_19(
73+
; CHECK-NOT: pow
74+
; CHECK-NEXT: %1 = fmul double %x, %x
75+
; CHECK-NEXT: %2 = fmul double %1, %1
76+
; CHECK-NEXT: %3 = fmul double %2, %2
77+
; CHECK-NEXT: %4 = fmul double %3, %3
78+
; CHECK-NEXT: %5 = fmul double %1, %4
79+
; CHECK-NEXT: %6 = fmul double %5, %x
80+
; CHECK-NEXT: %7 = fdiv double 1.000000e+00, %6
81+
; CHECK-NEXT: ret double %7
82+
%1 = call double @llvm.pow.f64(double %x, double -1.900000e+01)
83+
ret double %1
84+
}
85+
86+
; pow(x, 11.23)
87+
define double @test_simplify_11_23(double %x) #0 {
88+
; CHECK-LABEL: @test_simplify_11_23(
89+
; CHECK-NOT: fmul
90+
; CHECK-NEXT: %1 = call double @llvm.pow.f64(double %x, double 1.123000e+01)
91+
; CHECK-NEXT: ret double %1
92+
%1 = call double @llvm.pow.f64(double %x, double 1.123000e+01)
93+
ret double %1
94+
}
95+
96+
; pow(x, 32.0)
97+
define double @test_simplify_32(double %x) #0 {
98+
; CHECK-LABEL: @test_simplify_32(
99+
; CHECK-NOT: pow
100+
; CHECK-NEXT: %1 = fmul double %x, %x
101+
; CHECK-NEXT: %2 = fmul double %1, %1
102+
; CHECK-NEXT: %3 = fmul double %2, %2
103+
; CHECK-NEXT: %4 = fmul double %3, %3
104+
; CHECK-NEXT: %5 = fmul double %4, %4
105+
; CHECK-NEXT: ret double %5
106+
%1 = call double @llvm.pow.f64(double %x, double 3.200000e+01)
107+
ret double %1
108+
}
109+
110+
; pow(x, 33.0)
111+
define double @test_simplify_33(double %x) #0 {
112+
; CHECK-LABEL: @test_simplify_33(
113+
; CHECK-NOT: fmul
114+
; CHECK-NEXT: %1 = call double @llvm.pow.f64(double %x, double 3.300000e+01)
115+
; CHECK-NEXT: ret double %1
116+
%1 = call double @llvm.pow.f64(double %x, double 3.300000e+01)
117+
ret double %1
118+
}
119+
120+
attributes #0 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="true" "use-soft-float"="false" }

0 commit comments

Comments
 (0)
Please sign in to comment.