Skip to content

Commit

Permalink
Merge pull request #88 from JuliaMath/better-sin_sum
Browse files Browse the repository at this point in the history
better sin_sum
  • Loading branch information
heltonmc authored Apr 11, 2023
2 parents 6a3eed7 + daf86ce commit 4afbc6c
Show file tree
Hide file tree
Showing 3 changed files with 107 additions and 102 deletions.
46 changes: 13 additions & 33 deletions src/besselj.jl
Original file line number Diff line number Diff line change
Expand Up @@ -59,21 +59,11 @@ function _besselj0(x::Float64)
q2 = (-1/8, 25/384, -1073/5120, 375733/229376)
p = evalpoly(x2, p2)
q = evalpoly(x2, q2)
if x > 1e15
a = SQ2OPI(T) * sqrt(xinv) * p
xn = muladd(xinv, q, -PIO4(T))
s_x, c_x = sincos(x)
s_xn, c_xn = sincos(xn)
return a * (c_x * c_xn - s_x * s_xn)
end
end

a = SQ2OPI(T) * sqrt(xinv) * p
xn = muladd(xinv, q, -PIO4(T))

# the following computes b = cos(x + xn) more accurately
# see src/misc.jl
b = cos_sum(x, xn)
xn = xinv*q
b = sin_sum(x, pi/4, xn)
return a * b
end
end
Expand Down Expand Up @@ -137,21 +127,11 @@ function _besselj1(x::Float64)
q2 = (3/8, -21/128, 1899/5120, -543483/229376)
p = evalpoly(x2, p2)
q = evalpoly(x2, q2)
if x > 1e15
a = SQ2OPI(T) * sqrt(xinv) * p
xn = muladd(xinv, q, -3 * PIO4(T))
s_x, c_x = sincos(x)
s_xn, c_xn = sincos(xn)
return s * a * (c_x * c_xn - s_x * s_xn)
end
end

a = SQ2OPI(T) * sqrt(xinv) * p
xn = muladd(xinv, q, -3 * PIO4(T))

# the following computes b = cos(x + xn) more accurately
# see src/misc.jl
b = cos_sum(x, xn)
xn = xinv*q
b = sin_sum(x, -pi/4, xn)
return a * b * s
end
end
Expand Down Expand Up @@ -182,7 +162,7 @@ end
#####

besselj0(z::T) where T <: Union{ComplexF32, ComplexF64} = besseli0(z/im)
besselj1(z::T) where T <: Union{ComplexF32, ComplexF64} = besseli1(z/im) * im
besselj1(z::T) where T <: Union{ComplexF32, ComplexF64} = besseli1(z/im) * im


# Bessel functions of the first kind of order nu
Expand All @@ -208,17 +188,17 @@ besselj1(z::T) where T <: Union{ComplexF32, ComplexF64} = besseli1(z/im) * im
#
# For values where the expansions for large arguments and orders are not valid, backward recurrence is employed after shifting the order up
# to where `besseljy_debye` is accurate then using downward recurrence. In general, the routine will be the slowest when ν ≈ x as all methods struggle at this point.
#
#
# [1] http://dlmf.nist.gov/10.2.E2
# [2] Bremer, James. "An algorithm for the rapid numerical evaluation of Bessel functions of real orders and arguments."
# [2] Bremer, James. "An algorithm for the rapid numerical evaluation of Bessel functions of real orders and arguments."
# Advances in Computational Mathematics 45.1 (2019): 173-211.
# [3] Matviyenko, Gregory. "On the evaluation of Bessel functions."
# [3] Matviyenko, Gregory. "On the evaluation of Bessel functions."
# Applied and Computational Harmonic Analysis 1.1 (1993): 116-135.
# [4] Heitman, Z., Bremer, J., Rokhlin, V., & Vioreanu, B. (2015). On the asymptotics of Bessel functions in the Fresnel regime.
# [4] Heitman, Z., Bremer, J., Rokhlin, V., & Vioreanu, B. (2015). On the asymptotics of Bessel functions in the Fresnel regime.
# Applied and Computational Harmonic Analysis, 39(2), 347-356.
# [5] Ratis, Yu L., and P. Fernández de Córdoba. "A code to calculate (high order) Bessel functions based on the continued fractions method."
# [5] Ratis, Yu L., and P. Fernández de Córdoba. "A code to calculate (high order) Bessel functions based on the continued fractions method."
# Computer physics communications 76.3 (1993): 381-388.
# [6] Abramowitz, Milton, and Irene A. Stegun, eds. Handbook of mathematical functions with formulas, graphs, and mathematical tables.
# [6] Abramowitz, Milton, and Irene A. Stegun, eds. Handbook of mathematical functions with formulas, graphs, and mathematical tables.
# Vol. 55. US Government printing office, 1964.
#

Expand Down Expand Up @@ -432,7 +412,7 @@ end
# Cutoff for Float32 determined from using Float64 precision down to eps(Float32)
besselj_series_cutoff(v, x::Float32) = (x < 20.0) || v > (14.4 + x*(-0.455 + 0.027*x))
besselj_series_cutoff(v, x::Float64) = (x < 7.0) || v > (2 + x*(0.109 + 0.062*x))
# cutoff for Float128 for ~1e-35 relative error
# cutoff for Float128 for ~1e-35 relative error
#besselj_series_cutoff(v, x::AbstractFloat) = (x < 4.0) || v > (x*(0.08 + 0.12*x))

#=
Expand Down Expand Up @@ -467,7 +447,7 @@ end
# On the other hand, shifting the order down avoids any concern about underflow for large orders
# Shifting the order too high while keeping x fixed could result in numerical underflow
# Therefore we need to shift up only until the `besseljy_debye` is accurate and need to test that no underflow occurs
# Shifting the order up decreases the value substantially for high orders and results
# Shifting the order up decreases the value substantially for high orders and results
# in a stable forward recurrence as the values rapidly increase
function besselj_recurrence(nu, x)
# shift order up to where expansions are valid see src/U_polynomials.jl
Expand Down
52 changes: 16 additions & 36 deletions src/bessely.jl
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ function _bessely0_compute(x::Float64)
z = w * w
p = evalpoly(z, PP_y0(T)) / evalpoly(z, PQ_y0(T))
q = evalpoly(z, QP_y0(T)) / evalpoly(z, QQ_y0(T))
xn = x - PIO4(T)
xn = x - pi/4
sc = sincos(xn)
p = p * sc[1] + w * q * sc[2]
return p * SQ2OPI(T) / sqrt(x)
Expand All @@ -81,21 +81,11 @@ function _bessely0_compute(x::Float64)
q2 = (-1/8, 25/384, -1073/5120, 375733/229376)
p = evalpoly(x2, p2)
q = evalpoly(x2, q2)
if x > 1e15
a = SQ2OPI(T) * sqrt(xinv) * p
xn = muladd(xinv, q, -PIO4(T))
s_x, c_x = sincos(x)
s_xn, c_xn = sincos(xn)
return a * (s_x * c_xn + c_x * s_xn)
end
end

a = SQ2OPI(T) * sqrt(xinv) * p
xn = muladd(xinv, q, -PIO4(T))

# the following computes b = sin(x + xn) more accurately
# see src/misc.jl
b = sin_sum(x, xn)
xn = xinv*q
b = sin_sum(x, -pi/4, xn)
return a * b
end
end
Expand Down Expand Up @@ -170,21 +160,11 @@ function _bessely1_compute(x::Float64)
q2 = (3/8, -21/128, 1899/5120, -543483/229376)
p = evalpoly(x2, p2)
q = evalpoly(x2, q2)
if x > 1e15
a = SQ2OPI(T) * sqrt(xinv) * p
xn = muladd(xinv, q, - 3 * PIO4(T))
s_x, c_x = sincos(x)
s_xn, c_xn = sincos(xn)
return a * (s_x * c_xn + c_x * s_xn)
end
end

a = SQ2OPI(T) * sqrt(xinv) * p
xn = muladd(xinv, q, - 3 * PIO4(T))

# the following computes b = sin(x + xn) more accurately
# see src/misc.jl
b = sin_sum(x, xn)
xn = xinv*q
b = sin_sum(-x, -pi/4, -xn)
return a * b
end
end
Expand Down Expand Up @@ -225,7 +205,7 @@ end
# for positive arguments and orders. For integer orders up to 250, forward recurrence is used starting from
# `bessely0` and `bessely1` routines for calculation of Y_{n}(x) of the zero and first order.
# For small arguments, Y_{ν}(x) is calculated from the power series (`bessely_power_series(nu, x`) form of J_{ν}(x) using the connection formula [1].
#
#
# When x < ν and ν being reasonably large, the debye asymptotic expansion (Eq. 33; [3]) is used `besseljy_debye(nu, x)`.
# When x > ν and x being reasonably large, the Hankel function is calculated from the debye expansion (Eq. 29; [3]) with `hankel_debye(nu, x)`
# and Y_{n}(x) is calculated from the imaginary part of the Hankel function.
Expand All @@ -234,20 +214,20 @@ end
#
# For values where the expansions for large arguments and orders are not valid, forward recurrence is employed after shifting the order down
# to where these expansions are valid then using recurrence. In general, the routine will be the slowest when ν ≈ x as all methods struggle at this point.
# Additionally, the Hankel expansion is only accurate (to Float64 precision) when x > 19 and the power series can only be computed for x < 7
# Additionally, the Hankel expansion is only accurate (to Float64 precision) when x > 19 and the power series can only be computed for x < 7
# without loss of precision. Therefore, when x > 19 the Hankel expansion is used to generate starting values after shifting the order down.
# When x ∈ (7, 19) and ν ∈ (0, 2) Chebyshev approximation is used with higher orders filled by recurrence.
#
#
# [1] https://dlmf.nist.gov/10.2#E3
# [2] Bremer, James. "An algorithm for the rapid numerical evaluation of Bessel functions of real orders and arguments."
# [2] Bremer, James. "An algorithm for the rapid numerical evaluation of Bessel functions of real orders and arguments."
# Advances in Computational Mathematics 45.1 (2019): 173-211.
# [3] Matviyenko, Gregory. "On the evaluation of Bessel functions."
# [3] Matviyenko, Gregory. "On the evaluation of Bessel functions."
# Applied and Computational Harmonic Analysis 1.1 (1993): 116-135.
# [4] Heitman, Z., Bremer, J., Rokhlin, V., & Vioreanu, B. (2015). On the asymptotics of Bessel functions in the Fresnel regime.
# [4] Heitman, Z., Bremer, J., Rokhlin, V., & Vioreanu, B. (2015). On the asymptotics of Bessel functions in the Fresnel regime.
# Applied and Computational Harmonic Analysis, 39(2), 347-356.
# [5] Ratis, Yu L., and P. Fernández de Córdoba. "A code to calculate (high order) Bessel functions based on the continued fractions method."
# [5] Ratis, Yu L., and P. Fernández de Córdoba. "A code to calculate (high order) Bessel functions based on the continued fractions method."
# Computer physics communications 76.3 (1993): 381-388.
# [6] Abramowitz, Milton, and Irene A. Stegun, eds. Handbook of mathematical functions with formulas, graphs, and mathematical tables.
# [6] Abramowitz, Milton, and Irene A. Stegun, eds. Handbook of mathematical functions with formulas, graphs, and mathematical tables.
# Vol. 55. US Government printing office, 1964.
#

Expand Down Expand Up @@ -389,7 +369,7 @@ No checks on arguments are performed and should only be called if certain nu, x
"""
function bessely_positive_args(nu, x::T) where T
iszero(x) && return -T(Inf)

# use forward recurrence if nu is an integer up until it becomes inefficient
(isinteger(nu) && nu < 250) && return besselj_up_recurrence(x, bessely1(x), bessely0(x), 1, nu)[1]

Expand Down Expand Up @@ -418,7 +398,7 @@ end
# combined to calculate both J_v and J_{-v} in the same loop
# J_{-v} always converges slower so just check that convergence
# Combining the loop was faster than using two separate loops
#
#
# this works well for small arguments x < 7.0 for rel. error ~1e-14
# this also works well for nu > 1.35x - 4.5
# for nu > 25 more cancellation occurs near integer values
Expand Down Expand Up @@ -575,7 +555,7 @@ function log_bessely_power_series(v, x::T) where T
logout2 = -v * log(x/2) + log(out2)
spi, cpi = sincospi(v)
#tmp = logout2 + log(abs(sign*(inv(spi)) - exp(logout - logout2) * cpi / spi))
tmp = logout + log((-cpi + exp(logout2) - logout) / spi)
Expand Down
111 changes: 78 additions & 33 deletions src/misc.jl
Original file line number Diff line number Diff line change
@@ -1,47 +1,92 @@
# function to more accurately compute cos(x + xn)
# see https://github.com/heltonmc/Bessels.jl/pull/13
# written by @oscardssmith
function cos_sum(x, xn)
s = x + xn
n, r = reduce_pi02_med(s)
lo = r.lo - ((s - x) - xn)
hi = r.hi + lo
y = Base.Math.DoubleFloat64(hi, r.hi-hi+lo)
n = n&3
using Base.Math: sin_kernel, cos_kernel, sincos_kernel, rem_pio2_kernel, DoubleFloat64, DoubleFloat32

"""
computes sin(sum(xs)) where xs are sorted by absolute value
Doing this is much more accurate than the naive sin(sum(xs))
"""
function sin_sum(xs::Vararg{T})::T where T<:Base.IEEEFloat
n, y = rem_pio2_sum(xs...)
n &= 3
if n == 0
return Base.Math.cos_kernel(y)
return sin_kernel(y)
elseif n == 1
return -Base.Math.sin_kernel(y)
return cos_kernel(y)
elseif n == 2
return -Base.Math.cos_kernel(y)
return -sin_kernel(y)
else
return Base.Math.sin_kernel(y)
return -cos_kernel(y)
end
end
# function to more accurately compute sin(x + xn)
function sin_sum(x, xn)
s = x + xn
n, r = reduce_pi02_med(s)
lo = r.lo - ((s - x) - xn)
hi = r.hi + lo
y = Base.Math.DoubleFloat64(hi, r.hi-hi+lo)
n = n&3

"""
computes sincos(sum(xs)) where xs are sorted by absolute value
Doing this is much more accurate than the naive sincos(sum(xs))
"""
function sincos_sum(xs::Vararg{T})::T where T<:Base.IEEEFloat
n, y = rem_pio2_sum(xs...)
n &= 3
si, co = sincos_kernel(y)
if n == 0
return Base.Math.sin_kernel(y)
return si, co
elseif n == 1
return Base.Math.cos_kernel(y)
return co, -si
elseif n == 2
return -Base.Math.sin_kernel(y)
return -si, -co
else
return -Base.Math.cos_kernel(y)
return -co, si
end
end

function rem_pio2_sum(xs::Vararg{Float64})
n = 0
hi, lo = 0.0, 0.0
for x in xs
if abs(x) <= pi/4
s = x + hi
lo += (x - (s - hi))
else
n_i, y = rem_pio2_kernel(x)
n += n_i
s = y.hi + hi
lo += (y.hi - (s - hi)) + y.lo
end
hi = s
end
while hi > pi/4
hi -= pi/2
lo -= 6.123233995736766e-17
n += 1
end
while hi < -pi/4
hi += pi/2
lo += 6.123233995736766e-17
n -= 1
end
return n, DoubleFloat64(hi, lo)
end

function rem_pio2_sum(xs::Vararg{Float32})
y = 0.0
n = 0
# The minimum cosine or sine of any Float32 that gets reduced is 1.6e-9
# so reducing at 2^22 prevents catastrophic loss of precision.
# There probably is a case where this loses some digits but it is a decent
# tradeoff between accuracy and speed.
@fastmath for x in xs
if x > 0x1p22
n_i, y_i = rem_pio2_kernel(Float32(x))
n += n_i
y += y_i.hi
else
y += x
end
end
n_i, y = rem_pio2_kernel(y)
return n + n_i, DoubleFloat32(y.hi)
end
@inline function reduce_pi02_med(x::Float64)
pio2_1 = 1.57079632673412561417e+00

fn = round(x*(2/pi))
r = muladd(-fn, pio2_1, x)
w = fn * 6.07710050650619224932e-11
y = r-w
return unsafe_trunc(Int, fn), Base.Math.DoubleFloat64(y, (r-y)-w)
function rem_pio2_sum(xs::Vararg{Float16})
y = sum(Float64, xs) #Float16 can be losslessly accumulated in Float64
n, y = rem_pio2_kernel(y)
return n, DoubleFloat32(y.hi)
end

0 comments on commit 4afbc6c

Please sign in to comment.