#include <boost/math/policies/error_handling.hpp>

// For cuda we would rather use builtin nextafter than unsupported boost::math::nextafter
-// NVRTC does not support the forward declarations header
#ifndef BOOST_MATH_HAS_GPU_SUPPORT
# include <boost/math/special_functions/next.hpp>
-# ifndef BOOST_MATH_HAS_NVRTC
-# include <utility>
-# include <boost/math/special_functions/math_fwd.hpp>
-# endif // BOOST_MATH_HAS_NVRTC
+# include <boost/math/special_functions/math_fwd.hpp>
+# include <utility>
#endif // BOOST_MATH_ENABLE_CUDA

namespace boost {
@@ -52,6 +49,20 @@ BOOST_MATH_FORCEINLINE BOOST_MATH_GPU_ENABLED T local_fma(const T x, const T y,
    return x * y + z;
}

+#else
+
+template <typename T>
+BOOST_MATH_FORCEINLINE BOOST_MATH_GPU_ENABLED T local_fma(const T x, const T y, const T z)
+{
+    return ::fma(x, y, z);
+}
+
+template <>
+BOOST_MATH_FORCEINLINE BOOST_MATH_GPU_ENABLED float local_fma(const float x, const float y, const float z)
+{
+    return ::fmaf(x, y, z);
+}
+
#endif // BOOST_MATH_HAS_GPU_SUPPORT

template <typename T, typename Policy>
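The new `#else` branch above routes `local_fma` to the CUDA built-ins `::fma`/`::fmaf` when `BOOST_MATH_HAS_GPU_SUPPORT` is defined. The reason a fused multiply-add is wanted at all is that it evaluates `x*y + z` with a single rounding, which lets the compensated steps further down in this commit recover rounding errors exactly. A small host-side illustration of that property (standard `<cmath>` only; not part of this commit):

```cpp
#include <cmath>
#include <cstdio>

int main()
{
    double a = 1.0 + 1e-9;
    double b = 1.0 - 1e-9;
    double p = a * b;               // product rounded to double
    double e = std::fma(a, b, -p);  // exact rounding error: p + e == a*b exactly
    std::printf("p = %.17g\ne = %.17g\n", p, e);
}
```

The same trick appears in the last hunk below as `local_fma(-u, s, t)` and `local_fma(y, u, -w)`, which recover the residuals of a division and of a product respectively.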
@@ -71,7 +82,7 @@ BOOST_MATH_GPU_ENABLED T pow1p_imp(const T x, const T y, const Policy& pol)
    if (x == T(-1))
    {
        // pow(+/-0, y)
-        if (boost::math::isfinite(y) && y < 0)
+        if ((boost::math::isfinite)(y) && y < 0)
        {
            return boost::math::policies::raise_domain_error<T>(function, "Division by 0", x, pol);
        }
@@ -80,7 +91,7 @@ BOOST_MATH_GPU_ENABLED T pow1p_imp(const T x, const T y, const Policy& pol)
        return pow(T(0), y);
    }

-    if (x == T(-2) && boost::math::isinf(y))
+    if (x == T(-2) && (boost::math::isinf)(y))
    {
        // pow(-1, +/-inf)
        return T(1);
@@ -92,9 +103,9 @@ BOOST_MATH_GPU_ENABLED T pow1p_imp(const T x, const T y, const Policy& pol)
        return T(1);
    }

-    if (boost::math::isinf(y))
+    if ((boost::math::isinf)(y))
    {
-        if (boost::math::signbit(y) == 1)
+        if ((boost::math::signbit)(y) == 1)
        {
            // pow(x, -inf)
            return T(0);
@@ -106,7 +117,7 @@ BOOST_MATH_GPU_ENABLED T pow1p_imp(const T x, const T y, const Policy& pol)
        }
    }

-    if (boost::math::isinf(x))
+    if ((boost::math::isinf)(x))
    {
        // pow(+/-inf, y)
        return pow(x, y);
@@ -115,11 +126,11 @@ BOOST_MATH_GPU_ENABLED T pow1p_imp(const T x, const T y, const Policy& pol)
    // Up to this point, (1+x) = {+/-0, +/-1, +/-inf} have been handled, and
    // y = {+/-0, +/-inf} have been handled. Next, we handle `nan` and
    // y = {+/-1}.
-    if (boost::math::isnan(x))
+    if ((boost::math::isnan)(x))
    {
        return x;
    }
-    else if (boost::math::isnan(y))
+    else if ((boost::math::isnan)(y))
    {
        return y;
    }
@@ -191,30 +202,46 @@ BOOST_MATH_GPU_ENABLED T pow1p_imp(const T x, const T y, const Policy& pol)
    }

    // Because x > -1 and s is rounded toward 1, s is guaranteed to be > 0.
-    // Then (1+x)^y == (s+t)^y == (s^y)*((1+u)^y), where u := t/s.
-    // It can be shown that either both terms <= 1 or both >= 1, so
+    // Write (1+x)^y == (s+t)^y == (s^y)*((1+t/s)^y) == term1*term2.
+    // It can be shown that either both terms <= 1 or both >= 1; so
    // if the first term over/underflows, then the result over/underflows.
-    T u = t / s;
+    // And of course, term2 == 1 if t == 0.
    T term1 = pow(s, y);
-    if (term1 == T(0) || boost::math::isinf(term1))
+    if (t == T(0) || term1 == T(0) || (boost::math::isinf)(term1))
    {
        return term1;
    }

-    // (1+u)^y == exp(y*log(1+u)). Since u is close to machine epsilon,
-    // log(1+u) ~= u. Let y*u == z+w, where z is the rounded result and
-    // w is the rounding error. This improves accuracy when y is large.
-    // Then exp(y*u) == exp(z)*exp(w).
-    T z = y * u;
+    // (1+t/s)^y == exp(y*log(1+t/s)). The relative error of the result is
+    // equal to the absolute error of the exponent (plus the relative error
+    // of 'exp'). Therefore, when the exponent is small, it is accurately
+    // evaluated to machine epsilon using T arithmetic. In addition, since
+    // t/s <= epsilon, log(1+t/s) is well approximated by t/s to first order.
+    T u = t / s;
+    T w = y * u;
+    if (abs(w) <= T(0.5))
+    {
+        T term2 = exp(w);
+        return term1 * term2;
+    }
+
+    // Now y*log(1+t/s) is large, and its relative error is "magnified" by
+    // the exponent. To reduce the error, we use double-T arithmetic, and
+    // expand log(1+t/s) to second order.
+
+    // (u + uu) ~= t/s.
+    T r1 = local_fma(-u, s, t);
+    T uu = r1 / s;

-#ifdef BOOST_MATH_HAS_GPU_SUPPORT
-    T w = fma(y, u, -z);
-#else
-    T w = local_fma(y, u, -z);
-#endif
+    // (u + vv) ~= log(1+(u+uu)) ~= log(1+t/s).
+    T vv = local_fma(T(-0.5)*u, u, uu);

-    T term2 = exp(z) * exp(w);
+    // (w + ww) ~= y*(u+vv) ~= y*log(1+t/s).
+    T r2 = local_fma(y, u, -w);
+    T ww = local_fma(y, vv, r2);

+    // TODO: maybe ww is small enough such that exp(ww) ~= 1+ww.
+    T term2 = exp(w) * exp(ww);
    return term1 * term2;
}

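Putting the pieces of the final hunk together, the algorithm is: split `1+x` exactly into a head `s` and a tail `t`, let `pow` handle `s^y`, and fold the tail in through a carefully evaluated exponent `y*log(1+t/s)`. Below is a self-contained sketch of that core in plain C++, not the committed code itself: the special-case handling and the directed-rounding split of the real implementation are omitted, and the split here uses an ordinary Fast2Sum, which assumes `|x| <= 1`.

```cpp
#include <cmath>
#include <cstdio>

// Simplified double-precision sketch of the pow1p core shown in the diff above.
double pow1p_sketch(double x, double y)
{
    // Split 1 + x into a rounded head s and an exact tail t, so s + t == 1 + x.
    double s = 1.0 + x;
    double t = x - (s - 1.0);          // Fast2Sum tail (valid for |x| <= 1)

    double term1 = std::pow(s, y);     // carries the magnitude of the result
    if (t == 0.0 || term1 == 0.0 || std::isinf(term1))
        return term1;

    double u = t / s;                  // first-order log(1 + t/s)
    double w = y * u;
    if (std::fabs(w) <= 0.5)
        return term1 * std::exp(w);    // small exponent: plain arithmetic is accurate

    // Large exponent: refine t/s and log(1 + t/s) with fma-recovered residuals.
    double uu = std::fma(-u, s, t) / s;               // u + uu ~= t/s
    double vv = std::fma(-0.5 * u, u, uu);            // u + vv ~= log(1 + t/s)
    double ww = std::fma(y, vv, std::fma(y, u, -w));  // w + ww ~= y*log(1 + t/s)
    return term1 * std::exp(w) * std::exp(ww);
}

int main()
{
    // For x = 1e-9, y = 1e9 the true value is close to e; the naive form
    // can lose roughly half the significant digits to the rounding of 1 + x.
    double x = 1e-9, y = 1e9;
    std::printf("naive pow(1 + x, y): %.17g\n", std::pow(1.0 + x, y));
    std::printf("pow1p_sketch(x, y):  %.17g\n", pow1p_sketch(x, y));
}
```

Keeping `exp(w) * exp(ww)` as two factors, rather than `exp(w + ww)`, matters: `ww` lies far below the rounding granularity of `w`, so the sum `w + ww` would simply round back to `w` and discard the correction.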