Skip to content

Commit 1a450eb

Browse files
committed
Simplify FMA, fix concept fails, and sync from upstream
1 parent 866d9d0 commit 1a450eb

File tree

1 file changed

+54
-27
lines changed
  • include/boost/math/special_functions

1 file changed

+54
-27
lines changed

include/boost/math/special_functions/pow1p.hpp

Lines changed: 54 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -17,13 +17,10 @@
1717
#include <boost/math/policies/error_handling.hpp>
1818

1919
// For cuda we would rather use builtin nextafter than unsupported boost::math::nextafter
20-
// NVRTC does not support the forward declarations header
2120
#ifndef BOOST_MATH_HAS_GPU_SUPPORT
2221
# include <boost/math/special_functions/next.hpp>
23-
# ifndef BOOST_MATH_HAS_NVRTC
24-
# include <utility>
25-
# include <boost/math/special_functions/math_fwd.hpp>
26-
# endif // BOOST_MATH_HAS_NVRTC
22+
# include <boost/math/special_functions/math_fwd.hpp>
23+
# include <utility>
2724
#endif // BOOST_MATH_ENABLE_CUDA
2825

2926
namespace boost {
@@ -52,6 +49,20 @@ BOOST_MATH_FORCEINLINE BOOST_MATH_GPU_ENABLED T local_fma(const T x, const T y,
5249
return x * y + z;
5350
}
5451

52+
#else
53+
54+
template <typename T>
55+
BOOST_MATH_FORCEINLINE BOOST_MATH_GPU_ENABLED T local_fma(const T x, const T y, const T z)
56+
{
57+
return ::fma(x, y, z);
58+
}
59+
60+
template <>
61+
BOOST_MATH_FORCEINLINE BOOST_MATH_GPU_ENABLED float local_fma(const float x, const float y, const float z)
62+
{
63+
return ::fmaf(x, y, z);
64+
}
65+
5566
#endif // BOOST_MATH_HAS_GPU_SUPPORT
5667

5768
template <typename T, typename Policy>
@@ -71,7 +82,7 @@ BOOST_MATH_GPU_ENABLED T pow1p_imp(const T x, const T y, const Policy& pol)
7182
if (x == T(-1))
7283
{
7384
// pow(+/-0, y)
74-
if (boost::math::isfinite(y) && y < 0)
85+
if ((boost::math::isfinite)(y) && y < 0)
7586
{
7687
return boost::math::policies::raise_domain_error<T>(function, "Division by 0", x, pol);
7788
}
@@ -80,7 +91,7 @@ BOOST_MATH_GPU_ENABLED T pow1p_imp(const T x, const T y, const Policy& pol)
8091
return pow(T(0), y);
8192
}
8293

83-
if (x == T(-2) && boost::math::isinf(y))
94+
if (x == T(-2) && (boost::math::isinf)(y))
8495
{
8596
// pow(-1, +/-inf)
8697
return T(1);
@@ -92,9 +103,9 @@ BOOST_MATH_GPU_ENABLED T pow1p_imp(const T x, const T y, const Policy& pol)
92103
return T(1);
93104
}
94105

95-
if (boost::math::isinf(y))
106+
if ((boost::math::isinf)(y))
96107
{
97-
if (boost::math::signbit(y) == 1)
108+
if ((boost::math::signbit)(y) == 1)
98109
{
99110
// pow(x, -inf)
100111
return T(0);
@@ -106,7 +117,7 @@ BOOST_MATH_GPU_ENABLED T pow1p_imp(const T x, const T y, const Policy& pol)
106117
}
107118
}
108119

109-
if (boost::math::isinf(x))
120+
if ((boost::math::isinf)(x))
110121
{
111122
// pow(+/-inf, y)
112123
return pow(x, y);
@@ -115,11 +126,11 @@ BOOST_MATH_GPU_ENABLED T pow1p_imp(const T x, const T y, const Policy& pol)
115126
// Up to this point, (1+x) = {+/-0, +/-1, +/-inf} have been handled, and
116127
// and y = {+/-0, +/-inf} have been handled. Next, we handle `nan` and
117128
// y = {+/-1}.
118-
if (boost::math::isnan(x))
129+
if ((boost::math::isnan)(x))
119130
{
120131
return x;
121132
}
122-
else if (boost::math::isnan(y))
133+
else if ((boost::math::isnan)(y))
123134
{
124135
return y;
125136
}
@@ -191,30 +202,46 @@ BOOST_MATH_GPU_ENABLED T pow1p_imp(const T x, const T y, const Policy& pol)
191202
}
192203

193204
// Because x > -1 and s is rounded toward 1, s is guaranteed to be > 0.
194-
// Then (1+x)^y == (s+t)^y == (s^y)*((1+u)^y), where u := t / s.
195-
// It can be shown that either both terms <= 1 or both >= 1, so
205+
// Write (1+x)^y == (s+t)^y == (s^y)*((1+t/s)^y) == term1*term2.
206+
// It can be shown that either both terms <= 1 or both >= 1; so
196207
// if the first term over/underflows, then the result over/underflows.
197-
T u = t / s;
208+
// And of course, term2 == 1 if t == 0.
198209
T term1 = pow(s, y);
199-
if (term1 == T(0) || boost::math::isinf(term1))
210+
if (t == T(0) || term1 == T(0) || (boost::math::isinf)(term1))
200211
{
201212
return term1;
202213
}
203214

204-
// (1+u)^y == exp(y*log(1+u)). Since u is close to machine epsilon,
205-
// log(1+u) ~= u. Let y*u == z+w, where z is the rounded result and
206-
// w is the rounding error. This improves accuracy when y is large.
207-
// Then exp(y*u) == exp(z)*exp(w).
208-
T z = y * u;
215+
// (1+t/s)^y == exp(y*log(1+t/s)). The relative error of the result is
216+
// equal to the absolute error of the exponent (plus the relative error
217+
// of 'exp'). Therefore, when the exponent is small, it is accurately
218+
// evaluated to machine epsilon using T arithmetic. In addition, since
219+
// t/s <= epsilon, log(1+t/s) is well approximated by t/s to first order.
220+
T u = t / s;
221+
T w = y * u;
222+
if (abs(w) <= T(0.5))
223+
{
224+
T term2 = exp(w);
225+
return term1 * term2;
226+
}
227+
228+
// Now y*log(1+t/s) is large, and its relative error is "magnified" by
229+
// the exponent. To reduce the error, we use double-T arithmetic, and
230+
// expand log(1+t/s) to second order.
231+
232+
// (u + uu) ~= t/s.
233+
T r1 = local_fma(-u, s, t);
234+
T uu = r1 / s;
209235

210-
#ifdef BOOST_MATH_HAS_GPU_SUPPORT
211-
T w = fma(y, u, -z);
212-
#else
213-
T w = local_fma(y, u, -z);
214-
#endif
236+
// (u + vv) ~= log(1+(u+uu)) ~= log(1+t/s).
237+
T vv = local_fma(T(-0.5)*u, u, uu);
215238

216-
T term2 = exp(z) * exp(w);
239+
// (w + ww) ~= y*(u+vv) ~= y*log(1+t/s).
240+
T r2 = local_fma(y, u, -w);
241+
T ww = local_fma(y, vv, r2);
217242

243+
// TODO: maybe ww is small enough such that exp(ww) ~= 1+ww.
244+
T term2 = exp(w) * exp(ww);
218245
return term1 * term2;
219246
}
220247

0 commit comments

Comments
 (0)