From df002cc927953613b233321bf4a29c3d0e3920ef Mon Sep 17 00:00:00 2001 From: oscarddssmith Date: Fri, 7 Apr 2023 15:50:59 -0400 Subject: [PATCH 1/3] better sin_sum --- src/besselj.jl | 46 +++++++----------------- src/bessely.jl | 52 +++++++++------------------- src/misc.jl | 94 +++++++++++++++++++++++++++++++------------------- 3 files changed, 87 insertions(+), 105 deletions(-) diff --git a/src/besselj.jl b/src/besselj.jl index 34da648..3c3928b 100644 --- a/src/besselj.jl +++ b/src/besselj.jl @@ -59,21 +59,11 @@ function _besselj0(x::Float64) q2 = (-1/8, 25/384, -1073/5120, 375733/229376) p = evalpoly(x2, p2) q = evalpoly(x2, q2) - if x > 1e15 - a = SQ2OPI(T) * sqrt(xinv) * p - xn = muladd(xinv, q, -PIO4(T)) - s_x, c_x = sincos(x) - s_xn, c_xn = sincos(xn) - return a * (c_x * c_xn - s_x * s_xn) - end end a = SQ2OPI(T) * sqrt(xinv) * p - xn = muladd(xinv, q, -PIO4(T)) - - # the following computes b = cos(x + xn) more accurately - # see src/misc.jl - b = cos_sum(x, xn) + xn = xinv*q + b = sin_sum((x,), (pi/4, xn)) return a * b end end @@ -137,21 +127,11 @@ function _besselj1(x::Float64) q2 = (3/8, -21/128, 1899/5120, -543483/229376) p = evalpoly(x2, p2) q = evalpoly(x2, q2) - if x > 1e15 - a = SQ2OPI(T) * sqrt(xinv) * p - xn = muladd(xinv, q, -3 * PIO4(T)) - s_x, c_x = sincos(x) - s_xn, c_xn = sincos(xn) - return s * a * (c_x * c_xn - s_x * s_xn) - end end a = SQ2OPI(T) * sqrt(xinv) * p - xn = muladd(xinv, q, -3 * PIO4(T)) - - # the following computes b = cos(x + xn) more accurately - # see src/misc.jl - b = cos_sum(x, xn) + xn = xinv*q + b = sin_sum((x,), (-pi/4, xn)) return a * b * s end end @@ -182,7 +162,7 @@ end ##### besselj0(z::T) where T <: Union{ComplexF32, ComplexF64} = besseli0(z/im) -besselj1(z::T) where T <: Union{ComplexF32, ComplexF64} = besseli1(z/im) * im +besselj1(z::T) where T <: Union{ComplexF32, ComplexF64} = besseli1(z/im) * im # Bessel functions of the first kind of order nu @@ -208,17 +188,17 @@ besselj1(z::T) where T <: Union{ComplexF32, ComplexF64} = besseli1(z/im) * im # # For values where the expansions for large arguments and orders are not valid, backward recurrence is employed after shifting the order up # to where `besseljy_debye` is accurate then using downward recurrence. In general, the routine will be the slowest when ν ≈ x as all methods struggle at this point. -# +# # [1] http://dlmf.nist.gov/10.2.E2 -# [2] Bremer, James. "An algorithm for the rapid numerical evaluation of Bessel functions of real orders and arguments." +# [2] Bremer, James. "An algorithm for the rapid numerical evaluation of Bessel functions of real orders and arguments." # Advances in Computational Mathematics 45.1 (2019): 173-211. -# [3] Matviyenko, Gregory. "On the evaluation of Bessel functions." +# [3] Matviyenko, Gregory. "On the evaluation of Bessel functions." # Applied and Computational Harmonic Analysis 1.1 (1993): 116-135. -# [4] Heitman, Z., Bremer, J., Rokhlin, V., & Vioreanu, B. (2015). On the asymptotics of Bessel functions in the Fresnel regime. +# [4] Heitman, Z., Bremer, J., Rokhlin, V., & Vioreanu, B. (2015). On the asymptotics of Bessel functions in the Fresnel regime. # Applied and Computational Harmonic Analysis, 39(2), 347-356. -# [5] Ratis, Yu L., and P. Fernández de Córdoba. "A code to calculate (high order) Bessel functions based on the continued fractions method." +# [5] Ratis, Yu L., and P. Fernández de Córdoba. "A code to calculate (high order) Bessel functions based on the continued fractions method." # Computer physics communications 76.3 (1993): 381-388. 
-# [6] Abramowitz, Milton, and Irene A. Stegun, eds. Handbook of mathematical functions with formulas, graphs, and mathematical tables. +# [6] Abramowitz, Milton, and Irene A. Stegun, eds. Handbook of mathematical functions with formulas, graphs, and mathematical tables. # Vol. 55. US Government printing office, 1964. # @@ -432,7 +412,7 @@ end # Cutoff for Float32 determined from using Float64 precision down to eps(Float32) besselj_series_cutoff(v, x::Float32) = (x < 20.0) || v > (14.4 + x*(-0.455 + 0.027*x)) besselj_series_cutoff(v, x::Float64) = (x < 7.0) || v > (2 + x*(0.109 + 0.062*x)) -# cutoff for Float128 for ~1e-35 relative error +# cutoff for Float128 for ~1e-35 relative error #besselj_series_cutoff(v, x::AbstractFloat) = (x < 4.0) || v > (x*(0.08 + 0.12*x)) #= @@ -467,7 +447,7 @@ end # On the other hand, shifting the order down avoids any concern about underflow for large orders # Shifting the order too high while keeping x fixed could result in numerical underflow # Therefore we need to shift up only until the `besseljy_debye` is accurate and need to test that no underflow occurs -# Shifting the order up decreases the value substantially for high orders and results +# Shifting the order up decreases the value substantially for high orders and results # in a stable forward recurrence as the values rapidly increase function besselj_recurrence(nu, x) # shift order up to where expansions are valid see src/U_polynomials.jl diff --git a/src/bessely.jl b/src/bessely.jl index fd50753..c172392 100644 --- a/src/bessely.jl +++ b/src/bessely.jl @@ -64,7 +64,7 @@ function _bessely0_compute(x::Float64) z = w * w p = evalpoly(z, PP_y0(T)) / evalpoly(z, PQ_y0(T)) q = evalpoly(z, QP_y0(T)) / evalpoly(z, QQ_y0(T)) - xn = x - PIO4(T) + xn = x - pi/4 sc = sincos(xn) p = p * sc[1] + w * q * sc[2] return p * SQ2OPI(T) / sqrt(x) @@ -81,21 +81,11 @@ function _bessely0_compute(x::Float64) q2 = (-1/8, 25/384, -1073/5120, 375733/229376) p = evalpoly(x2, p2) q = evalpoly(x2, q2) - if x > 1e15 - a = SQ2OPI(T) * sqrt(xinv) * p - xn = muladd(xinv, q, -PIO4(T)) - s_x, c_x = sincos(x) - s_xn, c_xn = sincos(xn) - return a * (s_x * c_xn + c_x * s_xn) - end end a = SQ2OPI(T) * sqrt(xinv) * p - xn = muladd(xinv, q, -PIO4(T)) - - # the following computes b = sin(x + xn) more accurately - # see src/misc.jl - b = sin_sum(x, xn) + xn = xinv*q + b = sin_sum((x,), (-pi/4, xn)) return a * b end end @@ -170,21 +160,11 @@ function _bessely1_compute(x::Float64) q2 = (3/8, -21/128, 1899/5120, -543483/229376) p = evalpoly(x2, p2) q = evalpoly(x2, q2) - if x > 1e15 - a = SQ2OPI(T) * sqrt(xinv) * p - xn = muladd(xinv, q, - 3 * PIO4(T)) - s_x, c_x = sincos(x) - s_xn, c_xn = sincos(xn) - return a * (s_x * c_xn + c_x * s_xn) - end end a = SQ2OPI(T) * sqrt(xinv) * p - xn = muladd(xinv, q, - 3 * PIO4(T)) - - # the following computes b = sin(x + xn) more accurately - # see src/misc.jl - b = sin_sum(x, xn) + xn = xinv*q + b = sin_sum((-x,), (-pi/4, -xn)) return a * b end end @@ -225,7 +205,7 @@ end # for positive arguments and orders. For integer orders up to 250, forward recurrence is used starting from # `bessely0` and `bessely1` routines for calculation of Y_{n}(x) of the zero and first order. # For small arguments, Y_{ν}(x) is calculated from the power series (`bessely_power_series(nu, x`) form of J_{ν}(x) using the connection formula [1]. -# +# # When x < ν and ν being reasonably large, the debye asymptotic expansion (Eq. 33; [3]) is used `besseljy_debye(nu, x)`. 
# When x > ν and x being reasonably large, the Hankel function is calculated from the debye expansion (Eq. 29; [3]) with `hankel_debye(nu, x)` # and Y_{n}(x) is calculated from the imaginary part of the Hankel function. @@ -234,20 +214,20 @@ end # # For values where the expansions for large arguments and orders are not valid, forward recurrence is employed after shifting the order down # to where these expansions are valid then using recurrence. In general, the routine will be the slowest when ν ≈ x as all methods struggle at this point. -# Additionally, the Hankel expansion is only accurate (to Float64 precision) when x > 19 and the power series can only be computed for x < 7 +# Additionally, the Hankel expansion is only accurate (to Float64 precision) when x > 19 and the power series can only be computed for x < 7 # without loss of precision. Therefore, when x > 19 the Hankel expansion is used to generate starting values after shifting the order down. # When x ∈ (7, 19) and ν ∈ (0, 2) Chebyshev approximation is used with higher orders filled by recurrence. -# +# # [1] https://dlmf.nist.gov/10.2#E3 -# [2] Bremer, James. "An algorithm for the rapid numerical evaluation of Bessel functions of real orders and arguments." +# [2] Bremer, James. "An algorithm for the rapid numerical evaluation of Bessel functions of real orders and arguments." # Advances in Computational Mathematics 45.1 (2019): 173-211. -# [3] Matviyenko, Gregory. "On the evaluation of Bessel functions." +# [3] Matviyenko, Gregory. "On the evaluation of Bessel functions." # Applied and Computational Harmonic Analysis 1.1 (1993): 116-135. -# [4] Heitman, Z., Bremer, J., Rokhlin, V., & Vioreanu, B. (2015). On the asymptotics of Bessel functions in the Fresnel regime. +# [4] Heitman, Z., Bremer, J., Rokhlin, V., & Vioreanu, B. (2015). On the asymptotics of Bessel functions in the Fresnel regime. # Applied and Computational Harmonic Analysis, 39(2), 347-356. -# [5] Ratis, Yu L., and P. Fernández de Córdoba. "A code to calculate (high order) Bessel functions based on the continued fractions method." +# [5] Ratis, Yu L., and P. Fernández de Córdoba. "A code to calculate (high order) Bessel functions based on the continued fractions method." # Computer physics communications 76.3 (1993): 381-388. -# [6] Abramowitz, Milton, and Irene A. Stegun, eds. Handbook of mathematical functions with formulas, graphs, and mathematical tables. +# [6] Abramowitz, Milton, and Irene A. Stegun, eds. Handbook of mathematical functions with formulas, graphs, and mathematical tables. # Vol. 55. US Government printing office, 1964. # @@ -389,7 +369,7 @@ No checks on arguments are performed and should only be called if certain nu, x """ function bessely_positive_args(nu, x::T) where T iszero(x) && return -T(Inf) - + # use forward recurrence if nu is an integer up until it becomes inefficient (isinteger(nu) && nu < 250) && return besselj_up_recurrence(x, bessely1(x), bessely0(x), 1, nu)[1] @@ -418,7 +398,7 @@ end # combined to calculate both J_v and J_{-v} in the same loop # J_{-v} always converges slower so just check that convergence # Combining the loop was faster than using two separate loops -# +# # this works well for small arguments x < 7.0 for rel. 
error ~1e-14 # this also works well for nu > 1.35x - 4.5 # for nu > 25 more cancellation occurs near integer values @@ -575,7 +555,7 @@ function log_bessely_power_series(v, x::T) where T logout2 = -v * log(x/2) + log(out2) spi, cpi = sincospi(v) - + #tmp = logout2 + log(abs(sign*(inv(spi)) - exp(logout - logout2) * cpi / spi)) tmp = logout + log((-cpi + exp(logout2) - logout) / spi) diff --git a/src/misc.jl b/src/misc.jl index 6be8c72..08014f9 100644 --- a/src/misc.jl +++ b/src/misc.jl @@ -1,47 +1,69 @@ -# function to more accurately compute cos(x + xn) -# see https://github.com/heltonmc/Bessels.jl/pull/13 -# written by @oscardssmith -function cos_sum(x, xn) - s = x + xn - n, r = reduce_pi02_med(s) - lo = r.lo - ((s - x) - xn) - hi = r.hi + lo - y = Base.Math.DoubleFloat64(hi, r.hi-hi+lo) +using Base.Math: sin_kernel, cos_kernel, rem_pio2_kernel, DoubleFloat64, DoubleFloat32 +function sin_double(n,y) n = n&3 if n == 0 - return Base.Math.cos_kernel(y) + return sin_kernel(y) elseif n == 1 - return -Base.Math.sin_kernel(y) + return cos_kernel(y) elseif n == 2 - return -Base.Math.cos_kernel(y) + return -sin_kernel(y) else - return Base.Math.sin_kernel(y) + return -cos_kernel(y) end end -# function to more accurately compute sin(x + xn) -function sin_sum(x, xn) - s = x + xn - n, r = reduce_pi02_med(s) - lo = r.lo - ((s - x) - xn) - hi = r.hi + lo - y = Base.Math.DoubleFloat64(hi, r.hi-hi+lo) - n = n&3 - if n == 0 - return Base.Math.sin_kernel(y) - elseif n == 1 - return Base.Math.cos_kernel(y) - elseif n == 2 - return -Base.Math.sin_kernel(y) - else - return -Base.Math.cos_kernel(y) + +""" + computes sin(sum(xs)+sum(xlos)) where xs, xlos are sorted by absolute value + and xlos are all <=pi/4 (and therefore don't need to be reduced) + Doing this is much more accurate than the naive sin(sum(xs)+sum(xlos)) +""" +function sin_sum(xs::Tuple{Vararg{Float64}}, xlos::Tuple{Vararg{Float64}}) + n = 0 + hi, lo = 0.0, 0.0 + for x in xs + n_i, y = rem_pio2_kernel(x) + n += n_i + s = y.hi + hi + lo += (y.hi - (s - hi)) + y.lo + hi = s + end + for x in xlos + s = x + hi + lo += (x - (s - hi)) + hi = s end + while hi > pi/4 + hi -= pi/2 + lo -= 6.123233995736766e-17 + n += 1 + end + while hi < -pi/4 + hi += pi/2 + lo += 6.123233995736766e-17 + n -= 1 + end + sin_double(n,DoubleFloat64(hi, lo)) end -@inline function reduce_pi02_med(x::Float64) - pio2_1 = 1.57079632673412561417e+00 - fn = round(x*(2/pi)) - r = muladd(-fn, pio2_1, x) - w = fn * 6.07710050650619224932e-11 - y = r-w - return unsafe_trunc(Int, fn), Base.Math.DoubleFloat64(y, (r-y)-w) +function sin_sum(xs::Tuple{Vararg{Float32}}, xlos::Tuple{Vararg{Float32}}) + n = 0 + y = 0.0 + for x in xs + n_i, y_i = rem_pio2_kernel(x) + n += n_i + y += y_i.hi + end + for x in xlos + y += x + end + while y > pi/4 + y -= pi/2 + n += 1 + end + while y < -pi/4 + y += pi/2 + n -= 1 + end + + sin_double(n,DoubleFloat32(y)) end From 5f1c77f6feef20fd0fdaae414085a06effe2da67 Mon Sep 17 00:00:00 2001 From: oscarddssmith Date: Sun, 9 Apr 2023 17:31:29 -0400 Subject: [PATCH 2/3] sinsum improvements --- src/besselj.jl | 4 +-- src/bessely.jl | 4 +-- src/misc.jl | 71 ++++++++++++++++++++++++++++---------------------- 3 files changed, 44 insertions(+), 35 deletions(-) diff --git a/src/besselj.jl b/src/besselj.jl index 3c3928b..32eb9bb 100644 --- a/src/besselj.jl +++ b/src/besselj.jl @@ -63,7 +63,7 @@ function _besselj0(x::Float64) a = SQ2OPI(T) * sqrt(xinv) * p xn = xinv*q - b = sin_sum((x,), (pi/4, xn)) + b = sin_sum(x, pi/4, xn) return a * b end end @@ -131,7 +131,7 @@ 
function _besselj1(x::Float64) a = SQ2OPI(T) * sqrt(xinv) * p xn = xinv*q - b = sin_sum((x,), (-pi/4, xn)) + b = sin_sum(x, -pi/4, xn) return a * b * s end end diff --git a/src/bessely.jl b/src/bessely.jl index c172392..a66e4c1 100644 --- a/src/bessely.jl +++ b/src/bessely.jl @@ -85,7 +85,7 @@ function _bessely0_compute(x::Float64) a = SQ2OPI(T) * sqrt(xinv) * p xn = xinv*q - b = sin_sum((x,), (-pi/4, xn)) + b = sin_sum(x, -pi/4, xn) return a * b end end @@ -164,7 +164,7 @@ function _bessely1_compute(x::Float64) a = SQ2OPI(T) * sqrt(xinv) * p xn = xinv*q - b = sin_sum((-x,), (-pi/4, -xn)) + b = sin_sum(-x, -pi/4, -xn) return a * b end end diff --git a/src/misc.jl b/src/misc.jl index 08014f9..97f0c96 100644 --- a/src/misc.jl +++ b/src/misc.jl @@ -1,6 +1,12 @@ -using Base.Math: sin_kernel, cos_kernel, rem_pio2_kernel, DoubleFloat64, DoubleFloat32 -function sin_double(n,y) - n = n&3 +using Base.Math: sin_kernel, cos_kernel, sincos_kernel, rem_pio2_kernel, DoubleFloat64, DoubleFloat32 + +""" + computes sin(sum(xs)) where xs are sorted by absolute value + Doing this is much more accurate than the naive sin(sum(xs)) +""" +function sin_sum(xs::Vararg{T})::T where T<:Base.IEEEFloat + n, y = rem_pio2_sum(xs...) + n &= 3 if n == 0 return sin_kernel(y) elseif n == 1 @@ -13,21 +19,42 @@ function sin_double(n,y) end """ - computes sin(sum(xs)+sum(xlos)) where xs, xlos are sorted by absolute value - and xlos are all <=pi/4 (and therefore don't need to be reduced) - Doing this is much more accurate than the naive sin(sum(xs)+sum(xlos)) + computes sincos(sum(xs)) where xs are sorted by absolute value + Doing this is much more accurate than the naive sincos(sum(xs)) """ -function sin_sum(xs::Tuple{Vararg{Float64}}, xlos::Tuple{Vararg{Float64}}) +function sincos_sum(xs::Vararg{T})::T where T<:Base.IEEEFloat + n, y = rem_pio2_sum(xs...) 
+ n &= 3 + si, co = sincos_kernel(y) + if n == 0 + return si, co + elseif n == 1 + return co, -si + elseif n == 2 + return -si, -co + else + return -co, si + end +end + +function rem_pio2_sum(xs::Vararg{Float64}) n = 0 hi, lo = 0.0, 0.0 - for x in xs + small_start = length(xs)+1 + for i in eachindex(xs) + x = xs[i] + if abs(x) <= pi/4 + small_start = i + break + end n_i, y = rem_pio2_kernel(x) n += n_i s = y.hi + hi lo += (y.hi - (s - hi)) + y.lo hi = s end - for x in xlos + for i in small_start:length(xs) + x = xs[i] s = x + hi lo += (x - (s - hi)) hi = s @@ -42,28 +69,10 @@ function sin_sum(xs::Tuple{Vararg{Float64}}, xlos::Tuple{Vararg{Float64}}) lo += 6.123233995736766e-17 n -= 1 end - sin_double(n,DoubleFloat64(hi, lo)) + return n, DoubleFloat64(hi, lo) end -function sin_sum(xs::Tuple{Vararg{Float32}}, xlos::Tuple{Vararg{Float32}}) - n = 0 - y = 0.0 - for x in xs - n_i, y_i = rem_pio2_kernel(x) - n += n_i - y += y_i.hi - end - for x in xlos - y += x - end - while y > pi/4 - y -= pi/2 - n += 1 - end - while y < -pi/4 - y += pi/2 - n -= 1 - end - - sin_double(n,DoubleFloat32(y)) +function rem_pio2_sum(xs::Vararg{Union{Float32,Float64}}) + n, y = rem_pio2_kernel(sum(Float64, xs)) + return n, DoubleFloat32(y.hi) end From daf86ceef0d792b53141bbbb859dbe011d857ad9 Mon Sep 17 00:00:00 2001 From: oscarddssmith Date: Sun, 9 Apr 2023 22:08:21 -0400 Subject: [PATCH 3/3] improve Float32 rem_pio2_sum --- src/misc.jl | 48 +++++++++++++++++++++++++++++++----------------- 1 file changed, 31 insertions(+), 17 deletions(-) diff --git a/src/misc.jl b/src/misc.jl index 97f0c96..b456d81 100644 --- a/src/misc.jl +++ b/src/misc.jl @@ -40,23 +40,16 @@ end function rem_pio2_sum(xs::Vararg{Float64}) n = 0 hi, lo = 0.0, 0.0 - small_start = length(xs)+1 - for i in eachindex(xs) - x = xs[i] + for x in xs if abs(x) <= pi/4 - small_start = i - break + s = x + hi + lo += (x - (s - hi)) + else + n_i, y = rem_pio2_kernel(x) + n += n_i + s = y.hi + hi + lo += (y.hi - (s - hi)) + y.lo end - n_i, y = rem_pio2_kernel(x) - n += n_i - s = y.hi + hi - lo += (y.hi - (s - hi)) + y.lo - hi = s - end - for i in small_start:length(xs) - x = xs[i] - s = x + hi - lo += (x - (s - hi)) hi = s end while hi > pi/4 @@ -72,7 +65,28 @@ function rem_pio2_sum(xs::Vararg{Float64}) return n, DoubleFloat64(hi, lo) end -function rem_pio2_sum(xs::Vararg{Union{Float32,Float64}}) - n, y = rem_pio2_kernel(sum(Float64, xs)) +function rem_pio2_sum(xs::Vararg{Float32}) + y = 0.0 + n = 0 + # The minimum cosine or sine of any Float32 that gets reduced is 1.6e-9 + # so reducing at 2^22 prevents catastrophic loss of precision. + # There probably is a case where this loses some digits but it is a decent + # tradeoff between accuracy and speed. + @fastmath for x in xs + if x > 0x1p22 + n_i, y_i = rem_pio2_kernel(Float32(x)) + n += n_i + y += y_i.hi + else + y += x + end + end + n_i, y = rem_pio2_kernel(y) + return n + n_i, DoubleFloat32(y.hi) +end + +function rem_pio2_sum(xs::Vararg{Float16}) + y = sum(Float64, xs) #Float16 can be losslessly accumulated in Float64 + n, y = rem_pio2_kernel(y) return n, DoubleFloat32(y.hi) end
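A minimal usage sketch of the new `sin_sum` (Float64 method), assuming the definitions from src/misc.jl above are in scope; the argument values below are illustrative and not taken from the Bessel routines themselves:

    # Phase of the form x - pi/4 + xn, as in _bessely0_compute, with a large x.
    x  = 1.0e8        # large argument where the naive sum loses accuracy
    xn = 1.0e-9       # small asymptotic correction term (illustrative value)

    naive     = sin(x - pi/4 + xn)        # the sum is rounded before reduction
    accurate  = sin_sum(x, -pi/4, xn)     # terms passed largest magnitude first
    reference = Float64(sin(big(x) + big(-pi/4) + big(xn)))

    # `accurate` should agree with `reference` to within an ulp or two, while
    # `naive` can lose roughly half of its significant digits: rounding
    # x - pi/4 + xn to the nearest Float64 perturbs the phase by on the order
    # of eps(x) ≈ 1e-8 rad before any accurate argument reduction can happen.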
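A similar sketch for the Float32 method added in the last hunk, under the same assumption that src/misc.jl is in scope; the behaviour shown (per-term reduction only above 2^22) follows the comment in `rem_pio2_sum(xs::Vararg{Float32})`, and the inputs are again illustrative:

    xf = 1.5f7                   # above the 2^22 cutoff, so it is reduced on its own
    lo = -Float32(pi)/4

    naive32    = sin(xf + lo)                    # xf + lo rounds to the nearest ulp of xf (here 1.0)
    accurate32 = sin_sum(xf, lo)                 # Vararg{Float32} path
    reference  = Float32(sin(big(xf) + big(lo)))

    # `accurate32` should match `reference` to Float32 precision, whereas the
    # naive argument xf + lo is off by about 0.2 rad here, so `naive32` is
    # essentially meaningless.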