From df002cc927953613b233321bf4a29c3d0e3920ef Mon Sep 17 00:00:00 2001 From: oscarddssmith Date: Fri, 7 Apr 2023 15:50:59 -0400 Subject: [PATCH 1/3] better sin_sum --- src/besselj.jl | 46 +++++++----------------- src/bessely.jl | 52 +++++++++------------------- src/misc.jl | 94 +++++++++++++++++++++++++++++++------------------- 3 files changed, 87 insertions(+), 105 deletions(-) diff --git a/src/besselj.jl b/src/besselj.jl index 34da648..3c3928b 100644 --- a/src/besselj.jl +++ b/src/besselj.jl @@ -59,21 +59,11 @@ function _besselj0(x::Float64) q2 = (-1/8, 25/384, -1073/5120, 375733/229376) p = evalpoly(x2, p2) q = evalpoly(x2, q2) - if x > 1e15 - a = SQ2OPI(T) * sqrt(xinv) * p - xn = muladd(xinv, q, -PIO4(T)) - s_x, c_x = sincos(x) - s_xn, c_xn = sincos(xn) - return a * (c_x * c_xn - s_x * s_xn) - end end a = SQ2OPI(T) * sqrt(xinv) * p - xn = muladd(xinv, q, -PIO4(T)) - - # the following computes b = cos(x + xn) more accurately - # see src/misc.jl - b = cos_sum(x, xn) + xn = xinv*q + b = sin_sum((x,), (pi/4, xn)) return a * b end end @@ -137,21 +127,11 @@ function _besselj1(x::Float64) q2 = (3/8, -21/128, 1899/5120, -543483/229376) p = evalpoly(x2, p2) q = evalpoly(x2, q2) - if x > 1e15 - a = SQ2OPI(T) * sqrt(xinv) * p - xn = muladd(xinv, q, -3 * PIO4(T)) - s_x, c_x = sincos(x) - s_xn, c_xn = sincos(xn) - return s * a * (c_x * c_xn - s_x * s_xn) - end end a = SQ2OPI(T) * sqrt(xinv) * p - xn = muladd(xinv, q, -3 * PIO4(T)) - - # the following computes b = cos(x + xn) more accurately - # see src/misc.jl - b = cos_sum(x, xn) + xn = xinv*q + b = sin_sum((x,), (-pi/4, xn)) return a * b * s end end @@ -182,7 +162,7 @@ end ##### besselj0(z::T) where T <: Union{ComplexF32, ComplexF64} = besseli0(z/im) -besselj1(z::T) where T <: Union{ComplexF32, ComplexF64} = besseli1(z/im) * im +besselj1(z::T) where T <: Union{ComplexF32, ComplexF64} = besseli1(z/im) * im # Bessel functions of the first kind of order nu @@ -208,17 +188,17 @@ besselj1(z::T) where T <: Union{ComplexF32, ComplexF64} = besseli1(z/im) * im # # For values where the expansions for large arguments and orders are not valid, backward recurrence is employed after shifting the order up # to where `besseljy_debye` is accurate then using downward recurrence. In general, the routine will be the slowest when ν ≈ x as all methods struggle at this point. -# +# # [1] http://dlmf.nist.gov/10.2.E2 -# [2] Bremer, James. "An algorithm for the rapid numerical evaluation of Bessel functions of real orders and arguments." +# [2] Bremer, James. "An algorithm for the rapid numerical evaluation of Bessel functions of real orders and arguments." # Advances in Computational Mathematics 45.1 (2019): 173-211. -# [3] Matviyenko, Gregory. "On the evaluation of Bessel functions." +# [3] Matviyenko, Gregory. "On the evaluation of Bessel functions." # Applied and Computational Harmonic Analysis 1.1 (1993): 116-135. -# [4] Heitman, Z., Bremer, J., Rokhlin, V., & Vioreanu, B. (2015). On the asymptotics of Bessel functions in the Fresnel regime. +# [4] Heitman, Z., Bremer, J., Rokhlin, V., & Vioreanu, B. (2015). On the asymptotics of Bessel functions in the Fresnel regime. # Applied and Computational Harmonic Analysis, 39(2), 347-356. -# [5] Ratis, Yu L., and P. Fernández de Córdoba. "A code to calculate (high order) Bessel functions based on the continued fractions method." +# [5] Ratis, Yu L., and P. Fernández de Córdoba. "A code to calculate (high order) Bessel functions based on the continued fractions method." # Computer physics communications 76.3 (1993): 381-388. 
-# [6] Abramowitz, Milton, and Irene A. Stegun, eds. Handbook of mathematical functions with formulas, graphs, and mathematical tables. +# [6] Abramowitz, Milton, and Irene A. Stegun, eds. Handbook of mathematical functions with formulas, graphs, and mathematical tables. # Vol. 55. US Government printing office, 1964. # @@ -432,7 +412,7 @@ end # Cutoff for Float32 determined from using Float64 precision down to eps(Float32) besselj_series_cutoff(v, x::Float32) = (x < 20.0) || v > (14.4 + x*(-0.455 + 0.027*x)) besselj_series_cutoff(v, x::Float64) = (x < 7.0) || v > (2 + x*(0.109 + 0.062*x)) -# cutoff for Float128 for ~1e-35 relative error +# cutoff for Float128 for ~1e-35 relative error #besselj_series_cutoff(v, x::AbstractFloat) = (x < 4.0) || v > (x*(0.08 + 0.12*x)) #= @@ -467,7 +447,7 @@ end # On the other hand, shifting the order down avoids any concern about underflow for large orders # Shifting the order too high while keeping x fixed could result in numerical underflow # Therefore we need to shift up only until the `besseljy_debye` is accurate and need to test that no underflow occurs -# Shifting the order up decreases the value substantially for high orders and results +# Shifting the order up decreases the value substantially for high orders and results # in a stable forward recurrence as the values rapidly increase function besselj_recurrence(nu, x) # shift order up to where expansions are valid see src/U_polynomials.jl diff --git a/src/bessely.jl b/src/bessely.jl index fd50753..c172392 100644 --- a/src/bessely.jl +++ b/src/bessely.jl @@ -64,7 +64,7 @@ function _bessely0_compute(x::Float64) z = w * w p = evalpoly(z, PP_y0(T)) / evalpoly(z, PQ_y0(T)) q = evalpoly(z, QP_y0(T)) / evalpoly(z, QQ_y0(T)) - xn = x - PIO4(T) + xn = x - pi/4 sc = sincos(xn) p = p * sc[1] + w * q * sc[2] return p * SQ2OPI(T) / sqrt(x) @@ -81,21 +81,11 @@ function _bessely0_compute(x::Float64) q2 = (-1/8, 25/384, -1073/5120, 375733/229376) p = evalpoly(x2, p2) q = evalpoly(x2, q2) - if x > 1e15 - a = SQ2OPI(T) * sqrt(xinv) * p - xn = muladd(xinv, q, -PIO4(T)) - s_x, c_x = sincos(x) - s_xn, c_xn = sincos(xn) - return a * (s_x * c_xn + c_x * s_xn) - end end a = SQ2OPI(T) * sqrt(xinv) * p - xn = muladd(xinv, q, -PIO4(T)) - - # the following computes b = sin(x + xn) more accurately - # see src/misc.jl - b = sin_sum(x, xn) + xn = xinv*q + b = sin_sum((x,), (-pi/4, xn)) return a * b end end @@ -170,21 +160,11 @@ function _bessely1_compute(x::Float64) q2 = (3/8, -21/128, 1899/5120, -543483/229376) p = evalpoly(x2, p2) q = evalpoly(x2, q2) - if x > 1e15 - a = SQ2OPI(T) * sqrt(xinv) * p - xn = muladd(xinv, q, - 3 * PIO4(T)) - s_x, c_x = sincos(x) - s_xn, c_xn = sincos(xn) - return a * (s_x * c_xn + c_x * s_xn) - end end a = SQ2OPI(T) * sqrt(xinv) * p - xn = muladd(xinv, q, - 3 * PIO4(T)) - - # the following computes b = sin(x + xn) more accurately - # see src/misc.jl - b = sin_sum(x, xn) + xn = xinv*q + b = sin_sum((-x,), (-pi/4, -xn)) return a * b end end @@ -225,7 +205,7 @@ end # for positive arguments and orders. For integer orders up to 250, forward recurrence is used starting from # `bessely0` and `bessely1` routines for calculation of Y_{n}(x) of the zero and first order. # For small arguments, Y_{ν}(x) is calculated from the power series (`bessely_power_series(nu, x`) form of J_{ν}(x) using the connection formula [1]. -# +# # When x < ν and ν being reasonably large, the debye asymptotic expansion (Eq. 33; [3]) is used `besseljy_debye(nu, x)`. 
# When x > ν and x being reasonably large, the Hankel function is calculated from the debye expansion (Eq. 29; [3]) with `hankel_debye(nu, x)` # and Y_{n}(x) is calculated from the imaginary part of the Hankel function. @@ -234,20 +214,20 @@ end # # For values where the expansions for large arguments and orders are not valid, forward recurrence is employed after shifting the order down # to where these expansions are valid then using recurrence. In general, the routine will be the slowest when ν ≈ x as all methods struggle at this point. -# Additionally, the Hankel expansion is only accurate (to Float64 precision) when x > 19 and the power series can only be computed for x < 7 +# Additionally, the Hankel expansion is only accurate (to Float64 precision) when x > 19 and the power series can only be computed for x < 7 # without loss of precision. Therefore, when x > 19 the Hankel expansion is used to generate starting values after shifting the order down. # When x ∈ (7, 19) and ν ∈ (0, 2) Chebyshev approximation is used with higher orders filled by recurrence. -# +# # [1] https://dlmf.nist.gov/10.2#E3 -# [2] Bremer, James. "An algorithm for the rapid numerical evaluation of Bessel functions of real orders and arguments." +# [2] Bremer, James. "An algorithm for the rapid numerical evaluation of Bessel functions of real orders and arguments." # Advances in Computational Mathematics 45.1 (2019): 173-211. -# [3] Matviyenko, Gregory. "On the evaluation of Bessel functions." +# [3] Matviyenko, Gregory. "On the evaluation of Bessel functions." # Applied and Computational Harmonic Analysis 1.1 (1993): 116-135. -# [4] Heitman, Z., Bremer, J., Rokhlin, V., & Vioreanu, B. (2015). On the asymptotics of Bessel functions in the Fresnel regime. +# [4] Heitman, Z., Bremer, J., Rokhlin, V., & Vioreanu, B. (2015). On the asymptotics of Bessel functions in the Fresnel regime. # Applied and Computational Harmonic Analysis, 39(2), 347-356. -# [5] Ratis, Yu L., and P. Fernández de Córdoba. "A code to calculate (high order) Bessel functions based on the continued fractions method." +# [5] Ratis, Yu L., and P. Fernández de Córdoba. "A code to calculate (high order) Bessel functions based on the continued fractions method." # Computer physics communications 76.3 (1993): 381-388. -# [6] Abramowitz, Milton, and Irene A. Stegun, eds. Handbook of mathematical functions with formulas, graphs, and mathematical tables. +# [6] Abramowitz, Milton, and Irene A. Stegun, eds. Handbook of mathematical functions with formulas, graphs, and mathematical tables. # Vol. 55. US Government printing office, 1964. # @@ -389,7 +369,7 @@ No checks on arguments are performed and should only be called if certain nu, x """ function bessely_positive_args(nu, x::T) where T iszero(x) && return -T(Inf) - + # use forward recurrence if nu is an integer up until it becomes inefficient (isinteger(nu) && nu < 250) && return besselj_up_recurrence(x, bessely1(x), bessely0(x), 1, nu)[1] @@ -418,7 +398,7 @@ end # combined to calculate both J_v and J_{-v} in the same loop # J_{-v} always converges slower so just check that convergence # Combining the loop was faster than using two separate loops -# +# # this works well for small arguments x < 7.0 for rel. 
error ~1e-14 # this also works well for nu > 1.35x - 4.5 # for nu > 25 more cancellation occurs near integer values @@ -575,7 +555,7 @@ function log_bessely_power_series(v, x::T) where T logout2 = -v * log(x/2) + log(out2) spi, cpi = sincospi(v) - + #tmp = logout2 + log(abs(sign*(inv(spi)) - exp(logout - logout2) * cpi / spi)) tmp = logout + log((-cpi + exp(logout2) - logout) / spi) diff --git a/src/misc.jl b/src/misc.jl index 6be8c72..08014f9 100644 --- a/src/misc.jl +++ b/src/misc.jl @@ -1,47 +1,69 @@ -# function to more accurately compute cos(x + xn) -# see https://github.com/heltonmc/Bessels.jl/pull/13 -# written by @oscardssmith -function cos_sum(x, xn) - s = x + xn - n, r = reduce_pi02_med(s) - lo = r.lo - ((s - x) - xn) - hi = r.hi + lo - y = Base.Math.DoubleFloat64(hi, r.hi-hi+lo) +using Base.Math: sin_kernel, cos_kernel, rem_pio2_kernel, DoubleFloat64, DoubleFloat32 +function sin_double(n,y) n = n&3 if n == 0 - return Base.Math.cos_kernel(y) + return sin_kernel(y) elseif n == 1 - return -Base.Math.sin_kernel(y) + return cos_kernel(y) elseif n == 2 - return -Base.Math.cos_kernel(y) + return -sin_kernel(y) else - return Base.Math.sin_kernel(y) + return -cos_kernel(y) end end -# function to more accurately compute sin(x + xn) -function sin_sum(x, xn) - s = x + xn - n, r = reduce_pi02_med(s) - lo = r.lo - ((s - x) - xn) - hi = r.hi + lo - y = Base.Math.DoubleFloat64(hi, r.hi-hi+lo) - n = n&3 - if n == 0 - return Base.Math.sin_kernel(y) - elseif n == 1 - return Base.Math.cos_kernel(y) - elseif n == 2 - return -Base.Math.sin_kernel(y) - else - return -Base.Math.cos_kernel(y) + +""" + computes sin(sum(xs)+sum(xlos)) where xs, xlos are sorted by absolute value + and xlos are all <=pi/4 (and therefore don't need to be reduced) + Doing this is much more accurate than the naive sin(sum(xs)+sum(xlos)) +""" +function sin_sum(xs::Tuple{Vararg{Float64}}, xlos::Tuple{Vararg{Float64}}) + n = 0 + hi, lo = 0.0, 0.0 + for x in xs + n_i, y = rem_pio2_kernel(x) + n += n_i + s = y.hi + hi + lo += (y.hi - (s - hi)) + y.lo + hi = s + end + for x in xlos + s = x + hi + lo += (x - (s - hi)) + hi = s end + while hi > pi/4 + hi -= pi/2 + lo -= 6.123233995736766e-17 + n += 1 + end + while hi < -pi/4 + hi += pi/2 + lo += 6.123233995736766e-17 + n -= 1 + end + sin_double(n,DoubleFloat64(hi, lo)) end -@inline function reduce_pi02_med(x::Float64) - pio2_1 = 1.57079632673412561417e+00 - fn = round(x*(2/pi)) - r = muladd(-fn, pio2_1, x) - w = fn * 6.07710050650619224932e-11 - y = r-w - return unsafe_trunc(Int, fn), Base.Math.DoubleFloat64(y, (r-y)-w) +function sin_sum(xs::Tuple{Vararg{Float32}}, xlos::Tuple{Vararg{Float32}}) + n = 0 + y = 0.0 + for x in xs + n_i, y_i = rem_pio2_kernel(x) + n += n_i + y += y_i.hi + end + for x in xlos + y += x + end + while y > pi/4 + y -= pi/2 + n += 1 + end + while y < -pi/4 + y += pi/2 + n -= 1 + end + + sin_double(n,DoubleFloat32(y)) end From 5f1c77f6feef20fd0fdaae414085a06effe2da67 Mon Sep 17 00:00:00 2001 From: oscarddssmith Date: Sun, 9 Apr 2023 17:31:29 -0400 Subject: [PATCH 2/3] sinsum improvements --- src/besselj.jl | 4 +-- src/bessely.jl | 4 +-- src/misc.jl | 71 ++++++++++++++++++++++++++++---------------------- 3 files changed, 44 insertions(+), 35 deletions(-) diff --git a/src/besselj.jl b/src/besselj.jl index 3c3928b..32eb9bb 100644 --- a/src/besselj.jl +++ b/src/besselj.jl @@ -63,7 +63,7 @@ function _besselj0(x::Float64) a = SQ2OPI(T) * sqrt(xinv) * p xn = xinv*q - b = sin_sum((x,), (pi/4, xn)) + b = sin_sum(x, pi/4, xn) return a * b end end @@ -131,7 +131,7 @@ 
function _besselj1(x::Float64) a = SQ2OPI(T) * sqrt(xinv) * p xn = xinv*q - b = sin_sum((x,), (-pi/4, xn)) + b = sin_sum(x, -pi/4, xn) return a * b * s end end diff --git a/src/bessely.jl b/src/bessely.jl index c172392..a66e4c1 100644 --- a/src/bessely.jl +++ b/src/bessely.jl @@ -85,7 +85,7 @@ function _bessely0_compute(x::Float64) a = SQ2OPI(T) * sqrt(xinv) * p xn = xinv*q - b = sin_sum((x,), (-pi/4, xn)) + b = sin_sum(x, -pi/4, xn) return a * b end end @@ -164,7 +164,7 @@ function _bessely1_compute(x::Float64) a = SQ2OPI(T) * sqrt(xinv) * p xn = xinv*q - b = sin_sum((-x,), (-pi/4, -xn)) + b = sin_sum(-x, -pi/4, -xn) return a * b end end diff --git a/src/misc.jl b/src/misc.jl index 08014f9..97f0c96 100644 --- a/src/misc.jl +++ b/src/misc.jl @@ -1,6 +1,12 @@ -using Base.Math: sin_kernel, cos_kernel, rem_pio2_kernel, DoubleFloat64, DoubleFloat32 -function sin_double(n,y) - n = n&3 +using Base.Math: sin_kernel, cos_kernel, sincos_kernel, rem_pio2_kernel, DoubleFloat64, DoubleFloat32 + +""" + computes sin(sum(xs)) where xs are sorted by absolute value + Doing this is much more accurate than the naive sin(sum(xs)) +""" +function sin_sum(xs::Vararg{T})::T where T<:Base.IEEEFloat + n, y = rem_pio2_sum(xs...) + n &= 3 if n == 0 return sin_kernel(y) elseif n == 1 @@ -13,21 +19,42 @@ function sin_double(n,y) end """ - computes sin(sum(xs)+sum(xlos)) where xs, xlos are sorted by absolute value - and xlos are all <=pi/4 (and therefore don't need to be reduced) - Doing this is much more accurate than the naive sin(sum(xs)+sum(xlos)) + computes sincos(sum(xs)) where xs are sorted by absolute value + Doing this is much more accurate than the naive sincos(sum(xs)) """ -function sin_sum(xs::Tuple{Vararg{Float64}}, xlos::Tuple{Vararg{Float64}}) +function sincos_sum(xs::Vararg{T})::T where T<:Base.IEEEFloat + n, y = rem_pio2_sum(xs...) 
+ n &= 3 + si, co = sincos_kernel(y) + if n == 0 + return si, co + elseif n == 1 + return co, -si + elseif n == 2 + return -si, -co + else + return -co, si + end +end + +function rem_pio2_sum(xs::Vararg{Float64}) n = 0 hi, lo = 0.0, 0.0 - for x in xs + small_start = length(xs)+1 + for i in eachindex(xs) + x = xs[i] + if abs(x) <= pi/4 + small_start = i + break + end n_i, y = rem_pio2_kernel(x) n += n_i s = y.hi + hi lo += (y.hi - (s - hi)) + y.lo hi = s end - for x in xlos + for i in small_start:length(xs) + x = xs[i] s = x + hi lo += (x - (s - hi)) hi = s @@ -42,28 +69,10 @@ function sin_sum(xs::Tuple{Vararg{Float64}}, xlos::Tuple{Vararg{Float64}}) lo += 6.123233995736766e-17 n -= 1 end - sin_double(n,DoubleFloat64(hi, lo)) + return n, DoubleFloat64(hi, lo) end -function sin_sum(xs::Tuple{Vararg{Float32}}, xlos::Tuple{Vararg{Float32}}) - n = 0 - y = 0.0 - for x in xs - n_i, y_i = rem_pio2_kernel(x) - n += n_i - y += y_i.hi - end - for x in xlos - y += x - end - while y > pi/4 - y -= pi/2 - n += 1 - end - while y < -pi/4 - y += pi/2 - n -= 1 - end - - sin_double(n,DoubleFloat32(y)) +function rem_pio2_sum(xs::Vararg{Union{Float32,Float64}}) + n, y = rem_pio2_kernel(sum(Float64, xs)) + return n, DoubleFloat32(y.hi) end From daf86ceef0d792b53141bbbb859dbe011d857ad9 Mon Sep 17 00:00:00 2001 From: oscarddssmith Date: Sun, 9 Apr 2023 22:08:21 -0400 Subject: [PATCH 3/3] improve Float32 rem_pio2_sum --- src/misc.jl | 48 +++++++++++++++++++++++++++++++----------------- 1 file changed, 31 insertions(+), 17 deletions(-) diff --git a/src/misc.jl b/src/misc.jl index 97f0c96..b456d81 100644 --- a/src/misc.jl +++ b/src/misc.jl @@ -40,23 +40,16 @@ end function rem_pio2_sum(xs::Vararg{Float64}) n = 0 hi, lo = 0.0, 0.0 - small_start = length(xs)+1 - for i in eachindex(xs) - x = xs[i] + for x in xs if abs(x) <= pi/4 - small_start = i - break + s = x + hi + lo += (x - (s - hi)) + else + n_i, y = rem_pio2_kernel(x) + n += n_i + s = y.hi + hi + lo += (y.hi - (s - hi)) + y.lo end - n_i, y = rem_pio2_kernel(x) - n += n_i - s = y.hi + hi - lo += (y.hi - (s - hi)) + y.lo - hi = s - end - for i in small_start:length(xs) - x = xs[i] - s = x + hi - lo += (x - (s - hi)) hi = s end while hi > pi/4 @@ -72,7 +65,28 @@ function rem_pio2_sum(xs::Vararg{Float64}) return n, DoubleFloat64(hi, lo) end -function rem_pio2_sum(xs::Vararg{Union{Float32,Float64}}) - n, y = rem_pio2_kernel(sum(Float64, xs)) +function rem_pio2_sum(xs::Vararg{Float32}) + y = 0.0 + n = 0 + # The minimum cosine or sine of any Float32 that gets reduced is 1.6e-9 + # so reducing at 2^22 prevents catastrophic loss of precision. + # There probably is a case where this loses some digits but it is a decent + # tradeoff between accuracy and speed. + @fastmath for x in xs + if x > 0x1p22 + n_i, y_i = rem_pio2_kernel(Float32(x)) + n += n_i + y += y_i.hi + else + y += x + end + end + n_i, y = rem_pio2_kernel(y) + return n + n_i, DoubleFloat32(y.hi) +end + +function rem_pio2_sum(xs::Vararg{Float16}) + y = sum(Float64, xs) #Float16 can be losslessly accumulated in Float64 + n, y = rem_pio2_kernel(y) return n, DoubleFloat32(y.hi) end
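A minimal usage sketch of the new `sin_sum` (Float64 method), assuming the definitions from src/misc.jl above are in scope; the argument values below are illustrative and not taken from the Bessel routines themselves:

    # Phase of the form x - pi/4 + xn, as in _bessely0_compute, with a large x.
    x  = 1.0e8        # large argument where the naive sum loses accuracy
    xn = 1.0e-9       # small asymptotic correction term (illustrative value)

    naive     = sin(x - pi/4 + xn)        # the sum is rounded before reduction
    accurate  = sin_sum(x, -pi/4, xn)     # terms passed largest magnitude first
    reference = Float64(sin(big(x) + big(-pi/4) + big(xn)))

    # `accurate` should agree with `reference` to within an ulp or two, while
    # `naive` can lose roughly half of its significant digits: rounding
    # x - pi/4 + xn to the nearest Float64 perturbs the phase by on the order
    # of eps(x) ≈ 1e-8 rad before any accurate argument reduction can happen.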
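A similar sketch for the Float32 method added in the last hunk, under the same assumption that src/misc.jl is in scope; the behaviour shown (per-term reduction only above 2^22) follows the comment in `rem_pio2_sum(xs::Vararg{Float32})`, and the inputs are again illustrative:

    xf = 1.5f7                   # above the 2^22 cutoff, so it is reduced on its own
    lo = -Float32(pi)/4

    naive32    = sin(xf + lo)                    # xf + lo rounds to the nearest ulp of xf (here 1.0)
    accurate32 = sin_sum(xf, lo)                 # Vararg{Float32} path
    reference  = Float32(sin(big(xf) + big(lo)))

    # `accurate32` should match `reference` to Float32 precision, whereas the
    # naive argument xf + lo is off by about 0.2 rad here, so `naive32` is
    # essentially meaningless.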