From 9c7d8d852354453fbaea19335c4d9a8a0c7a2d70 Mon Sep 17 00:00:00 2001 From: Michael Helton Date: Mon, 24 Apr 2023 15:40:40 -0400 Subject: [PATCH 01/19] update complex airy --- src/Airy/cairy.jl | 197 +++++++++++++++++++++++++++++++--------------- 1 file changed, 135 insertions(+), 62 deletions(-) diff --git a/src/Airy/cairy.jl b/src/Airy/cairy.jl index 1263bfd..f279207 100644 --- a/src/Airy/cairy.jl +++ b/src/Airy/cairy.jl @@ -50,35 +50,55 @@ airyai(z::Number) = _airyai(float(z)) function _airyai(z::Complex{T}) where T <: Union{Float32, Float64} if ~isfinite(z) - if abs(angle(z)) < 2*T(π)/3 + if abs(angle(z)) < T(2)*π/3 return exp(-z) else return 1 / z end end + + x, y = reim(z) + + check_conj = false + if y < zero(T) + z = conj(z) + check_conj = true + end + + if airy_large_argument_cutoff(x, y) + r = airyaix_large_args(z)[1] * exp(-T(2)/3 * z * sqrt(z)) + elseif airyai_levin_cutoff(x, y) + r = airyaix_levin(z, Val(17)) * exp(-T(2)/3 * z * sqrt(z)) + elseif airyai_power_series_cutoff(x, y) + r = airyai_power_series(z)[1] + else + c = cispi(one(T)/3) + r = c * _airyai(-z*c) + conj(c) * _airyai(-z*conj(c)) + end + + return check_conj ? conj(r) : r +end + +function _airyaix(z::Complex{T}) where T <: Union{Float32, Float64} x, y = real(z), imag(z) - airy_large_argument_cutoff(z) && return airyai_large_args(z)[1] - airyai_power_series_cutoff(x, y) && return airyai_power_series(z)[1] - if x > zero(T) - # use relation to besselk (http://dlmf.nist.gov/9.6.E1) - zz = 2 * z * sqrt(z) / 3 - return sqrt(z / 3) * besselk_continued_fraction_shift(one(T)/3, zz) / T(π) + check_conj = false + if y < zero(T) + z = conj(z) + check_conj = true + end + + if airy_large_argument_cutoff(x, y) + r = airyaix_large_args(z)[1] + elseif airyai_levin_cutoff(x, y) + r = airyaix_levin(z, Val(17)) + elseif airyai_power_series_cutoff(x, y) + r = airyai_power_series(z)[1] * exp(2 * z * sqrt(z) / 3) else - # z is close to the negative real axis - # for imag(z) == 0 use reflection to compute in terms of bessel functions of first kind (http://dlmf.nist.gov/9.6.E5) - # use computation for real numbers then convert to input type for stability - # for imag(z) != 0 use rotation identity (http://dlmf.nist.gov/9.2.E14) - if iszero(y) - xabs = abs(x) - xx = 2 * xabs * sqrt(xabs) / 3 - Jv, Yv = besseljy_positive_args(one(T)/3, xx) - Jmv = (Jv - sqrt(T(3)) * Yv) / 2 - return convert(eltype(z), sqrt(xabs) * (Jmv + Jv) / 3) - else - return cispi(one(T)/3) * _airyai(-z*cispi(one(T)/3)) + cispi(-one(T)/3) * _airyai(-z*cispi(-one(T)/3)) - end + c = cispi(one(T)/3) + r = exp(2 * z * sqrt(z) / 3) * (c * _airyai(-z*c) + conj(c) * _airyai(-z*conj(c))) end + return check_conj ? conj(r) : r end """ @@ -111,29 +131,48 @@ function _airyaiprime(z::Complex{T}) where T <: Union{Float32, Float64} return 1 / z end end - x, y = real(z), imag(z) - airy_large_argument_cutoff(z) && return airyai_large_args(z)[2] - airyai_power_series_cutoff(x, y) && return airyai_power_series(z)[2] - if x > zero(T) - # use relation to besselk (http://dlmf.nist.gov/9.6.E2) - zz = 2 * z * sqrt(z) / 3 - return -z * besselk_continued_fraction_shift(T(2)/3, zz) / (T(π) * sqrt(T(3))) + x, y = reim(z) + + check_conj = false + if y < zero(T) + z = conj(z) + check_conj = true + end + + if airy_large_argument_cutoff(x, y) + r = airyaix_large_args(z)[2] * exp(-T(2)/3 * z * sqrt(z)) + elseif airyai_levin_cutoff(x, y) + r = airyaiprimex_levin(z, Val(17)) * exp(-T(2)/3 * z * sqrt(z)) + elseif airyai_power_series_cutoff(x, y) + r = airyai_power_series(z)[2] else - # z is close to the negative real axis - # for imag(z) == 0 use reflection to compute in terms of bessel functions of first kind (http://dlmf.nist.gov/9.6.E5) - # use computation for real numbers then convert to input type for stability - # for imag(z) != 0 use rotation identity (http://dlmf.nist.gov/9.2.E14) - if iszero(y) - xabs = abs(x) - xx = 2 * xabs * sqrt(xabs) / 3 - Jv, Yv = besseljy_positive_args(T(2)/3, xx) - Jmv = -(Jv + sqrt(T(3))*Yv) / 2 - return convert(eltype(z), xabs * (Jv - Jmv) / 3) - else - return -(cispi(T(2)/3) * _airyaiprime(-z * cispi(one(T)/3)) + cispi(-T(2)/3) * _airyaiprime(-z * cispi(-one(T)/3))) - end + r = -cispi(T(2)/3) * _airyaiprime(-z*cispi(one(T)/3)) - cispi(-T(2)/3) * _airyaiprime(-z*cispi(-one(T)/3)) end + + return check_conj ? conj(r) : r +end + +function _airyaiprimex(z::Complex{T}) where T <: Union{Float32, Float64} + x, y = reim(z) + + check_conj = false + if y < zero(T) + z = conj(z) + check_conj = true + end + + if airy_large_argument_cutoff(x, y) + r = airyaix_large_args(z)[2] + elseif airyai_levin_cutoff(x, y) + r = airyaiprimex_levin(z, Val(17)) + elseif airyai_power_series_cutoff(x, y) + r = airyai_power_series(z)[2] * exp(T(2)/3 * z * sqrt(z)) + else + r = exp(2 * z * sqrt(z) / 3) * (-cispi(T(2)/3) * _airyaiprime(-z*cispi(one(T)/3)) - cispi(-T(2)/3) * _airyaiprime(-z*cispi(-one(T)/3))) + end + + return check_conj ? conj(r) : r end """ @@ -259,6 +298,7 @@ function airyai_power_series(z::Complex{T}) where T z2 = z * z z3 = z2 * z p = SIMDMath.horner_simd(z3, pack_AIRYAI_POW_COEF) + # TODO: use complex shufflevector to SIMD this muladd ai = muladd(-p[2], z, p[1]) aip = muladd(p[4], z2, -p[3]) return ai, aip @@ -274,7 +314,8 @@ function airybi_power_series(z::Complex{T}) where T end # cutoffs for power series valid for both airyai and airyaiprime -airyai_power_series_cutoff(x::T, y::T) where T <: Float64 = x < 2 && abs(y) > -1.4*(x + 5.5) +airyai_power_series_cutoff(x::T, y::T) where T <: Float64 = x > -4.5 || (abs(angle(complex(x, y)) < 9pi/10)) +#airyai_power_series_cutoff(x::T, y::T) where T <: Float64 = x < 2 && abs(y) > -1.4*(x + 5.5) airyai_power_series_cutoff(x::T, y::T) where T <: Float32 = x < 4.5f0 && abs(y) > -1.4f0*(x + 9.5f0) # cutoffs for power series valid for both airybi and airybiprime @@ -299,18 +340,15 @@ end ##### Large argument expansion for airy functions ##### airy_large_argument_cutoff(z::ComplexOrReal{Float64}) = abs(z) > 8.3 +airy_large_argument_cutoff(x::Float64, y::Float64) = (x^2 + y^2) > 68.89000000000001 # 8.3^2 +airy_large_argument_cutoff(x::Float32, y::Float32) = (x^2 + y^2) > 16.0f0 # 8.3^2 + airy_large_argument_cutoff(z::ComplexOrReal{Float32}) = abs(z) > 4 # valid in 0 <= angle(z) <= pi # use conjugation for bottom half plane -function airyai_large_args(z::Complex{T}) where T - if imag(z) < zero(T) - out = conj.(airyaix_large_args(conj(z))) - else - out = airyaix_large_args(z) - end - return out .* exp(-2/3 * z * sqrt(z)) -end +airyai_large_args(z::Complex{T}) where T = airyaix_large_args(z) .* exp(-2/3 * z * sqrt(z)) + function airybi_large_args(z::Complex{T}) where T if imag(z) < zero(T) out = conj.(airybix_large_args(conj(z))) @@ -360,8 +398,8 @@ end invx3 = @fastmath inv(z^3) p = SIMDMath.horner_simd(invx3, pack_AIRY_ASYM_COEF) - pvec1 = SIMDMath.ComplexVec{2, Float64}((p.re[1], p.re[3]), (p.im[1], p.im[3])) - pvec2 = SIMDMath.ComplexVec{2, Float64}((p.re[2], p.re[4]), (p.im[2], p.im[4])) + pvec1 = SIMDMath.ComplexVec((p[1], p[3])) + pvec2 = SIMDMath.ComplexVec((p[2], p[4])) zvec = SIMDMath.ComplexVec{2, Float64}((xsqrx.re, xsqrx.re), (xsqrx.im, xsqrx.im)) zvec = SIMDMath.fmul(zvec, pvec2) @@ -372,20 +410,57 @@ end return A, B, C, D end +@generated function airyaix_levin(x::Complex{T}, ::Val{N}) where {T <: Union{Float32, Float64}, N} + :( + begin + xsqr = sqrt(x) + out = zero(typeof(x)) + t = GAMMA_ONE_SIXTH(T) * GAMMA_FIVE_SIXTHS(T) / 4 + a = inv(4*xsqr*x) + + l = @ntuple $N i -> begin + out += t + t *= -3 * a * (i - 5//6) * (i - 1//6) / i + invt = @fastmath inv(t) + Vec{4, T}((reim(out * invt)..., reim(invt)...)) + end + return @fastmath levin_transform(l) / (sqrt(T(π)^3) * sqrt(xsqr)) + end + ) +end +@generated function airyaiprimex_levin(x::Complex{T}, ::Val{N}) where {T <: Union{Float32, Float64}, N} + :( + begin + xsqr = sqrt(x) + out = zero(typeof(x)) + t = -GAMMA_ONE_SIXTH(T) * GAMMA_FIVE_SIXTHS(T) / 4 + a = inv(4*xsqr*x) + + l = @ntuple $N i -> begin + out += t + t *= -3 * a * (i - 7//6) * (i + 1//6) / i + invt = @fastmath inv(t) + Vec{4, T}((reim(out * invt)..., reim(invt)...)) + end + return @fastmath levin_transform(l) * sqrt(xsqr) / sqrt(T(π)^3) + end + ) +end + +airyai_levin_cutoff(x::T, y::T) where T <: Union{Float32, Float64} = x > 2.0 || (x > 1.0 && y > 4.3) + # Asymptotic Expansion coefficients # to generate asymptotic expansions then split into even and odd coefficients # tuple(Float64.([3^k * gamma(k + 1//6) * gamma(k + 5//6) / (2^(2k+2) * gamma(k+1)) for k in big"0":big"24"])...) # tuple(Float64.([(3)^k * gamma(k - 1//6) * gamma(k + 7//6) / (2^(2k+2) * gamma(k+1)) for k in big"0":big"24"])...) -const AIRY_ASYM_COEF = ( +const pack_AIRY_ASYM_COEF = SIMDMath.pack_poly(( (1.5707963267948966, 0.13124057851910487, 0.4584353787485384, 5.217255928936184, 123.97197893818594, 5038.313653002081, 312467.7049060495, 2.746439545069411e7, 3.2482560591146026e9, 4.97462635569055e11, 9.57732308323407e13, 2.2640712393216476e16, 6.447503420809101e18), (0.1636246173744684, 0.20141783231057064, 1.3848568733028765, 23.555289417250567, 745.2667344964557, 37835.063701047824, 2.8147130917899106e6, 2.8856687720069575e8, 3.8998976239149216e10, 6.718472897263214e12, 1.4370735281142392e15, 3.7367429394637446e17, 1.1608192617215053e20), (-1.5707963267948966, 0.15510250188621483, 0.4982993247266722, 5.515384839161109, 129.24738229725767, 5209.103946324185, 321269.61208650155, 2.812618811215662e7, 3.3166403972012258e9, 5.0676100258903735e11, 9.738286496397669e13, 2.298637212441062e16, 6.537678293827411e18), (0.22907446432425577, 0.22511404787652015, 1.4803642438754887, 24.70432792540913, 773.390007496322, 38999.21950723391, 2.8878225227454924e6, 2.950515261265541e8, 3.97712331943799e10, 6.837383921993536e12, 1.460066704564067e15, 3.7912939312807334e17, 1.176400728321794e20) -) - -const pack_AIRY_ASYM_COEF = SIMDMath.pack_poly(AIRY_ASYM_COEF) +)) # Power series coefficients @@ -402,21 +477,19 @@ const pack_AIRY_ASYM_COEF = SIMDMath.pack_poly(AIRY_ASYM_COEF) # tuple(Float64.([3^(-2k + 1//6) / (gamma(k + 1//3) * gamma(k+1)) for k in big"0":big"31"])...) # tuple(Float64.([3^(-2k - 7//6) / (gamma(k + 5//3) * gamma(k+1)) for k in big"0":big"31"])...) -const AIRYAI_POW_COEF = ( +const pack_AIRYAI_POW_COEF = SIMDMath.pack_poly(( (0.3550280538878172, 0.05917134231463621, 0.00197237807715454, 2.7394139960479725e-5, 2.0753136333696763e-7, 9.882445873188935e-10, 3.2295574748983444e-12, 7.689422559281772e-15, 1.3930113332032197e-17, 1.984346628494615e-20, 2.280858193671971e-23, 2.159903592492397e-26, 1.7142092003907912e-29, 1.156686370034272e-32, 6.717110162800651e-36, 3.392479880202349e-39, 1.5037588121464314e-42, 5.897093380966397e-46, 2.060479867563381e-49, 6.455137429709841e-53, 1.8234851496355484e-56, 4.6684207619957715e-60, 1.0882099678311822e-63, 2.3192880814816327e-67, 4.536948516200377e-71, 8.174682011171851e-75, 1.3610859159460292e-78, 2.1004412283117732e-82, 3.0126810503611208e-86, 4.026571839563112e-90, 5.026931135534472e-94, 5.875328582906115e-98), (0.2588194037928068, 0.021568283649400565, 0.0005135305630809659, 5.705895145344065e-6, 3.657625093169273e-8, 1.5240104554871968e-10, 4.456170922477184e-13, 9.645391607093471e-16, 1.6075652678489119e-18, 2.1264090844562326e-21, 2.286461381135734e-24, 2.0378443682136668e-27, 1.5299131893495997e-30, 9.8071358291641e-34, 5.430307768086435e-37, 2.623337086032094e-40, 1.1153644073265705e-43, 4.2057481422570536e-47, 1.4160768155747653e-50, 4.2833539491069734e-54, 1.1703152866412495e-57, 2.902567675201512e-61, 6.56392509091251e-65, 1.3589907020522795e-68, 2.585598748196879e-72, 4.536138154731366e-76, 7.361470552955803e-80, 1.108321372019844e-83, 1.5522708291594454e-87, 2.0275219816607177e-91, 2.4756068152145513e-95, 2.8318540553815505e-99), (0.2588194037928068, 0.08627313459760226, 0.003594713941566761, 5.705895145344065e-5, 4.7549126211200545e-7, 2.438416728779515e-9, 8.46672475270665e-12, 2.1219861535605637e-14, 4.0189131696222797e-17, 5.953945436477451e-20, 7.088030281520775e-23, 6.928670851926468e-26, 5.660678800593519e-29, 3.9228543316656404e-32, 2.335032340277167e-35, 1.206735059574763e-38, 5.4652855959001957e-42, 2.1869890339736677e-45, 7.78842248566121e-49, 2.4843452904820447e-52, 7.138923248511622e-56, 1.8576433121289676e-59, 4.397829810911382e-63, 9.512934914365956e-67, 1.8874870861837215e-70, 3.4474649975958385e-74, 5.815561736835084e-78, 9.08823525056272e-82, 1.3194302047855285e-85, 1.7842193438614314e-89, 2.2528022018452416e-93, 2.6619428120586576e-97), (0.1775140269439086, 0.011834268462927242, 0.0002465472596443175, 2.4903763600436113e-6, 1.48236688097834e-8, 5.81320345481702e-11, 1.6147787374491722e-13, 3.3432271996877272e-16, 5.35773589693546e-19, 6.842574581015913e-22, 7.12768185522491e-25, 6.171153121406848e-28, 4.511076843133661e-31, 2.8211862683762735e-34, 1.526615946091057e-37, 7.21804229830287e-41, 3.0075176242928623e-44, 1.1126591284842257e-47, 3.6794283349346094e-51, 1.094091089781329e-54, 2.941105080057336e-58, 7.182185787685802e-62, 1.6003087762223267e-65, 3.2666029316642714e-69, 6.131011508378888e-73, 1.0616470144379027e-76, 1.7013573949325364e-80, 2.5306520823033415e-84, 3.5031175004199077e-88, 4.5242380219810255e-92, 5.4640555821026875e-96, 6.184556403059069e-100) -) -const AIRYBI_POW_COEF = ( +)) + +const pack_AIRYBI_POW_COEF = SIMDMath.pack_poly(( (0.6149266274460007, 0.10248777124100013, 0.0034162590413666706, 4.7448042241203764e-5, 3.5945486546366485e-7, 1.7116898355412612e-9, 5.5937576324877815e-12, 1.3318470553542337e-14, 2.412766404627235e-17, 3.4369891803806764e-20, 3.9505622762996283e-23, 3.7410627616473754e-26, 2.9690974298788695e-29, 2.0034395613217741e-32, 1.163437608200798e-35, 5.875947516165647e-39, 2.604586664967042e-42, 1.0214065352811929e-45, 3.568855818592568e-49, 1.1180625998097016e-52, 3.1583689260161066e-56, 8.08594195088609e-60, 1.884834953586501e-63, 4.017124794515134e-67, 7.858225341383282e-71, 1.415896457906898e-74, 2.3574699598849448e-78, 3.6380709257483713e-82, 5.2181166462254325e-86, 6.9742270064493885e-90, 8.706900132895616e-94, 1.0176367616755045e-97), (0.4482883573538264, 0.03735736311281886, 0.0008894610264956872, 9.882900294396524e-6, 6.335192496408029e-8, 2.6396635401700117e-10, 7.718314444941556e-13, 1.6706308322384318e-15, 2.7843847203973864e-18, 3.683048571954215e-21, 3.960267281671199e-24, 3.52964998366417e-27, 2.6498873751232507e-30, 1.698645753284135e-33, 9.405568955061657e-37, 4.5437531183872734e-40, 1.9318678224435686e-43, 7.284569466227635e-47, 2.4527169919958367e-50, 7.418986666654073e-54, 2.0270455373371784e-57, 5.0273946858560976e-61, 1.136905175453663e-64, 2.353840942968246e-68, 4.478388399863482e-72, 7.856821754146459e-76, 1.275044101614161e-79, 1.919668927452817e-83, 2.688611943211228e-87, 3.511771085699096e-91, 4.28787678351538e-95, 4.904915103540814e-99), (0.4482883573538264, 0.14942945245127545, 0.0062262271854698105, 9.882900294396525e-5, 8.235750245330437e-7, 4.223461664272019e-9, 1.4664797445388956e-11, 3.67538783092455e-14, 6.960961800993466e-17, 1.0312536001471802e-19, 1.2276828573180715e-22, 1.2000809944458178e-25, 9.804583287956028e-29, 6.79458301313654e-32, 4.0443946506765124e-35, 2.0901264344581458e-38, 9.466152329973487e-42, 3.78797612243837e-45, 1.3489943455977101e-48, 4.3030122666593625e-52, 1.236497777775679e-55, 3.2175325989479025e-59, 7.617264675539542e-63, 1.6476886600777724e-66, 3.269223531900342e-70, 5.97118453315131e-74, 1.007284840275187e-77, 1.5741285205113097e-81, 2.2853201517295438e-85, 3.0903585554152047e-89, 3.901967872998996e-93, 4.6106201973283655e-97), (0.30746331372300034, 0.020497554248200024, 0.0004270323801708338, 4.313458385563978e-6, 2.567534753311892e-8, 1.0068763738478007e-10, 2.796878816243891e-13, 5.790639371105364e-16, 9.279870787027826e-19, 1.1851686828898885e-21, 1.2345507113436338e-24, 1.068875074756393e-27, 7.813414289154919e-31, 4.8864379544433515e-34, 2.6441763822745408e-37, 1.2502015991841802e-40, 5.209173329934083e-44, 1.9271821420399865e-47, 6.372956818915299e-51, 1.895021355609664e-54, 5.0941434290582364e-58, 1.2439910693670906e-61, 2.771816108215443e-65, 5.657922245795964e-69, 1.0619223434301734e-72, 1.838826568710257e-76, 2.946837449856181e-80, 4.383217982829363e-84, 6.067577495610968e-88, 7.836210119606054e-92, 9.464021883582192e-96, 1.071196591237373e-99) -) - -const pack_AIRYAI_POW_COEF = SIMDMath.pack_poly(AIRYAI_POW_COEF) -const pack_AIRYBI_POW_COEF = SIMDMath.pack_poly(AIRYBI_POW_COEF) +)) # promote Float16 values for internalf in (:airyai, :airyaiprime, :airybi, :airybiprime), T in (:ComplexF16,) From e070e1bb8693c2f6bae3798fe27a13556c15d685 Mon Sep 17 00:00:00 2001 From: Michael Helton Date: Mon, 24 Apr 2023 15:40:55 -0400 Subject: [PATCH 02/19] add separate precompile statement --- src/precompile.jl | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 src/precompile.jl diff --git a/src/precompile.jl b/src/precompile.jl new file mode 100644 index 0000000..1c313ce --- /dev/null +++ b/src/precompile.jl @@ -0,0 +1,3 @@ +precompile(besselj, (Float64, Float64)) +precompile(airyai, (Float64,)) +precompile(airyai, (ComplexF64,)) From 0a7746e76914a5877c5de4cc4b3848de3433e69d Mon Sep 17 00:00:00 2001 From: Michael Helton Date: Mon, 24 Apr 2023 15:41:09 -0400 Subject: [PATCH 03/19] update file name and add Levin --- Project.toml | 2 +- src/Bessels.jl | 4 +- src/math.jl | 165 +++++++++++++++++++++++++++++++++++++++++++++++++ src/misc.jl | 92 --------------------------- 4 files changed, 168 insertions(+), 95 deletions(-) create mode 100644 src/math.jl delete mode 100644 src/misc.jl diff --git a/Project.toml b/Project.toml index 62953f7..cb22803 100644 --- a/Project.toml +++ b/Project.toml @@ -7,7 +7,7 @@ SIMDMath = "5443be0b-e40a-4f70-a07e-dcd652efc383" [compat] julia = "1.8" -SIMDMath = "0.2.3" +SIMDMath = "0.2.5" [extras] Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/src/Bessels.jl b/src/Bessels.jl index 110cc8e..830892d 100644 --- a/src/Bessels.jl +++ b/src/Bessels.jl @@ -62,11 +62,11 @@ include("constants.jl") include("math_constants.jl") include("U_polynomials.jl") include("recurrence.jl") -include("misc.jl") +include("math.jl") include("Polynomials/besselj_polys.jl") include("asymptotics.jl") include("gamma.jl") -precompile(besselj, (Float64, Float64)) +include("precompile.jl") end diff --git a/src/math.jl b/src/math.jl new file mode 100644 index 0000000..7fd11ae --- /dev/null +++ b/src/math.jl @@ -0,0 +1,165 @@ +using Base.Math: sin_kernel, cos_kernel, sincos_kernel, rem_pio2_kernel, DoubleFloat64, DoubleFloat32 + +""" + computes sin(sum(xs)) where xs are sorted by absolute value + Doing this is much more accurate than the naive sin(sum(xs)) +""" +function sin_sum(xs::Vararg{T, N})::T where {T<:Base.IEEEFloat, N} + n, y = rem_pio2_sum(xs...) + n &= 3 + if n == 0 + return sin_kernel(y) + elseif n == 1 + return cos_kernel(y) + elseif n == 2 + return -sin_kernel(y) + else + return -cos_kernel(y) + end +end + +""" + computes sincos(sum(xs)) where xs are sorted by absolute value + Doing this is much more accurate than the naive sincos(sum(xs)) +""" +function sincos_sum(xs::Vararg{T, N})::T where {T<:Base.IEEEFloat, N} + n, y = rem_pio2_sum(xs...) + n &= 3 + si, co = sincos_kernel(y) + if n == 0 + return si, co + elseif n == 1 + return co, -si + elseif n == 2 + return -si, -co + else + return -co, si + end +end + +function rem_pio2_sum(xs::Vararg{Float64, N}) where N + n = 0 + hi, lo = 0.0, 0.0 + for x in xs + if abs(x) <= pi/4 + s = x + hi + lo += (x - (s - hi)) + else + n_i, y = rem_pio2_kernel(x) + n += n_i + s = y.hi + hi + lo += (y.hi - (s - hi)) + y.lo + end + hi = s + end + while hi > pi/4 + hi -= pi/2 + lo -= 6.123233995736766e-17 + n += 1 + end + while hi < -pi/4 + hi += pi/2 + lo += 6.123233995736766e-17 + n -= 1 + end + return n, DoubleFloat64(hi, lo) +end + +function rem_pio2_sum(xs::Vararg{Float32, N}) where N + y = 0.0 + n = 0 + # The minimum cosine or sine of any Float32 that gets reduced is 1.6e-9 + # so reducing at 2^22 prevents catastrophic loss of precision. + # There probably is a case where this loses some digits but it is a decent + # tradeoff between accuracy and speed. + @fastmath for x in xs + if x > 0x1p22 + n_i, y_i = rem_pio2_kernel(Float32(x)) + n += n_i + y += y_i.hi + else + y += x + end + end + n_i, y = rem_pio2_kernel(y) + return n + n_i, DoubleFloat32(y.hi) +end + +function rem_pio2_sum(xs::Vararg{Float16, N}) where N + y = sum(Float64, xs) #Float16 can be losslessly accumulated in Float64 + n, y = rem_pio2_kernel(y) + return n, DoubleFloat32(y.hi) +end + +# Levin's Sequence transformation + +using Base.Cartesian +using SIMDMath: fmadd, Vec, FloatTypes + +#@inline levin_scale(B::T, n, k) where T = -(B + n) * (B + n + k)^(k - one(T)) / (B + n + k + one(T))^k +@inline levin_scale(B::T, n, k) where T = -(B + n + k) * (B + n + k - 1) / ((B + n + 2k) * (B + n + 2k - 1)) + +# implementation for real numbers +@inline @generated function levin_transform(s::NTuple{N, Vec{2, T}}) where {N, T <: FloatTypes} + len = N - 1 + :( + begin + @nexprs $N i -> a_{i} = s[i] + @nexprs $len k -> (@nexprs ($len-k) i -> a_{i} = fmadd(a_{i}, levin_scale(one(T), i, k-1), a_{i+1})) + return (a_1[1] / a_1[2]) + end + ) +end + +# implementation for complex numbers +@inline @generated function levin_transform(s::NTuple{N, Vec{4, T}}) where {N, T <: FloatTypes} + len = N - 1 + :( + begin + @nexprs $N i -> a_{i} = s[i] + @nexprs $len k -> (@nexprs ($len-k) i -> a_{i} = fmadd(a_{i}, levin_scale(one(T), i, k-1), a_{i+1})) + return (complex(a_1[1], a_1[2]) / complex(a_1[3], a_1[4])) + end + ) +end + +@generated function besselkx_levin(v, x::T, ::Val{N}) where {T <: FloatTypes, N} + :( + begin + # generate a sequence of partial sums and weights of the asymptotic expansion for large argument besselk + l = let (out, term, fv2) = (one(typeof(x)), one(typeof(x)), 4*v^2) + @ntuple $N i -> begin + offset = muladd(2, i, -1) + term *= muladd(offset, -offset, fv2) / (8 * x * i) + out += term + # this check is relatively cheap but if the normal sum converges then we should return value + abs(term) <= eps(T) && return out * sqrt(π / 2x) + # need to return the weights and weighted partial sums + invterm = inv(term) + Vec{2, T}((out * invterm, invterm)) + end + end + return levin_transform(l) * sqrt(π / 2x) + end + ) +end +@generated function besselkx_levin(v, x::Complex{T}, ::Val{N}) where {T <: FloatTypes, N} + :( + begin + # generate a sequence of partial sums and weights of the asymptotic expansion for large argument besselk + l = let (out, term, fv2, invx) = (one(typeof(x)), one(typeof(x)), 4*v^2, inv(8*x)) + @ntuple $N i -> begin + offset = muladd(2, i, -1) + term *= @fastmath muladd(offset, -offset, fv2) * invx / i + out += term + # this check is relatively cheap but if the normal sum converges then we should return value + (abs(real(term)) <= eps(T) && abs(imag(term)) <= eps(T)) && return out * sqrt(π / 2x) + # need to return the weights and weighted partial sums + invterm = @fastmath inv(term) + Vec{4, T}((reim(out * invterm)..., reim(invterm)...)) + end + end + return levin_transform(l) * sqrt(π / 2x) + end + ) +end diff --git a/src/misc.jl b/src/misc.jl deleted file mode 100644 index 7947dc9..0000000 --- a/src/misc.jl +++ /dev/null @@ -1,92 +0,0 @@ -using Base.Math: sin_kernel, cos_kernel, sincos_kernel, rem_pio2_kernel, DoubleFloat64, DoubleFloat32 - -""" - computes sin(sum(xs)) where xs are sorted by absolute value - Doing this is much more accurate than the naive sin(sum(xs)) -""" -function sin_sum(xs::Vararg{T, N})::T where {T<:Base.IEEEFloat, N} - n, y = rem_pio2_sum(xs...) - n &= 3 - if n == 0 - return sin_kernel(y) - elseif n == 1 - return cos_kernel(y) - elseif n == 2 - return -sin_kernel(y) - else - return -cos_kernel(y) - end -end - -""" - computes sincos(sum(xs)) where xs are sorted by absolute value - Doing this is much more accurate than the naive sincos(sum(xs)) -""" -function sincos_sum(xs::Vararg{T, N})::T where {T<:Base.IEEEFloat, N} - n, y = rem_pio2_sum(xs...) - n &= 3 - si, co = sincos_kernel(y) - if n == 0 - return si, co - elseif n == 1 - return co, -si - elseif n == 2 - return -si, -co - else - return -co, si - end -end - -function rem_pio2_sum(xs::Vararg{Float64, N}) where N - n = 0 - hi, lo = 0.0, 0.0 - for x in xs - if abs(x) <= pi/4 - s = x + hi - lo += (x - (s - hi)) - else - n_i, y = rem_pio2_kernel(x) - n += n_i - s = y.hi + hi - lo += (y.hi - (s - hi)) + y.lo - end - hi = s - end - while hi > pi/4 - hi -= pi/2 - lo -= 6.123233995736766e-17 - n += 1 - end - while hi < -pi/4 - hi += pi/2 - lo += 6.123233995736766e-17 - n -= 1 - end - return n, DoubleFloat64(hi, lo) -end - -function rem_pio2_sum(xs::Vararg{Float32, N}) where N - y = 0.0 - n = 0 - # The minimum cosine or sine of any Float32 that gets reduced is 1.6e-9 - # so reducing at 2^22 prevents catastrophic loss of precision. - # There probably is a case where this loses some digits but it is a decent - # tradeoff between accuracy and speed. - @fastmath for x in xs - if x > 0x1p22 - n_i, y_i = rem_pio2_kernel(Float32(x)) - n += n_i - y += y_i.hi - else - y += x - end - end - n_i, y = rem_pio2_kernel(y) - return n + n_i, DoubleFloat32(y.hi) -end - -function rem_pio2_sum(xs::Vararg{Float16, N}) where N - y = sum(Float64, xs) #Float16 can be losslessly accumulated in Float64 - n, y = rem_pio2_kernel(y) - return n, DoubleFloat32(y.hi) -end From f9d40d811294db4cf429389fbb0042633d5d15b8 Mon Sep 17 00:00:00 2001 From: Michael Helton Date: Mon, 24 Apr 2023 18:12:10 -0400 Subject: [PATCH 04/19] add some constant prop --- src/Airy/cairy.jl | 46 +++++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/src/Airy/cairy.jl b/src/Airy/cairy.jl index f279207..1ddfab8 100644 --- a/src/Airy/cairy.jl +++ b/src/Airy/cairy.jl @@ -50,7 +50,7 @@ airyai(z::Number) = _airyai(float(z)) function _airyai(z::Complex{T}) where T <: Union{Float32, Float64} if ~isfinite(z) - if abs(angle(z)) < T(2)*π/3 + if abs(angle(z)) < T(2π/3) return exp(-z) else return 1 / z @@ -66,9 +66,9 @@ function _airyai(z::Complex{T}) where T <: Union{Float32, Float64} end if airy_large_argument_cutoff(x, y) - r = airyaix_large_args(z)[1] * exp(-T(2)/3 * z * sqrt(z)) + r = airyaix_large_args(z)[1] * exp(-T(2/3) * z * sqrt(z)) elseif airyai_levin_cutoff(x, y) - r = airyaix_levin(z, Val(17)) * exp(-T(2)/3 * z * sqrt(z)) + r = airyaix_levin(z, Val(17)) * exp(-T(2/3) * z * sqrt(z)) elseif airyai_power_series_cutoff(x, y) r = airyai_power_series(z)[1] else @@ -93,10 +93,10 @@ function _airyaix(z::Complex{T}) where T <: Union{Float32, Float64} elseif airyai_levin_cutoff(x, y) r = airyaix_levin(z, Val(17)) elseif airyai_power_series_cutoff(x, y) - r = airyai_power_series(z)[1] * exp(2 * z * sqrt(z) / 3) + r = airyai_power_series(z)[1] * exp(T(2/3) * z * sqrt(z)) else c = cispi(one(T)/3) - r = exp(2 * z * sqrt(z) / 3) * (c * _airyai(-z*c) + conj(c) * _airyai(-z*conj(c))) + r = exp(T(2/3) * z * sqrt(z)) * (c * _airyai(-z*c) + conj(c) * _airyai(-z*conj(c))) end return check_conj ? conj(r) : r end @@ -125,7 +125,7 @@ airyaiprime(z::Number) = _airyaiprime(float(z)) function _airyaiprime(z::Complex{T}) where T <: Union{Float32, Float64} if ~isfinite(z) - if abs(angle(z)) < 2*T(π)/3 + if abs(angle(z)) < T(2π/3) return -exp(-z) else return 1 / z @@ -141,13 +141,13 @@ function _airyaiprime(z::Complex{T}) where T <: Union{Float32, Float64} end if airy_large_argument_cutoff(x, y) - r = airyaix_large_args(z)[2] * exp(-T(2)/3 * z * sqrt(z)) + r = airyaix_large_args(z)[2] * exp(-T(2/3) * z * sqrt(z)) elseif airyai_levin_cutoff(x, y) - r = airyaiprimex_levin(z, Val(17)) * exp(-T(2)/3 * z * sqrt(z)) + r = airyaiprimex_levin(z, Val(17)) * exp(-T(2/3) * z * sqrt(z)) elseif airyai_power_series_cutoff(x, y) r = airyai_power_series(z)[2] else - r = -cispi(T(2)/3) * _airyaiprime(-z*cispi(one(T)/3)) - cispi(-T(2)/3) * _airyaiprime(-z*cispi(-one(T)/3)) + r = -cispi(T(2/3)) * _airyaiprime(-z*cispi(one(T)/3)) - cispi(-T(2/3)) * _airyaiprime(-z*cispi(-one(T)/3)) end return check_conj ? conj(r) : r @@ -167,9 +167,9 @@ function _airyaiprimex(z::Complex{T}) where T <: Union{Float32, Float64} elseif airyai_levin_cutoff(x, y) r = airyaiprimex_levin(z, Val(17)) elseif airyai_power_series_cutoff(x, y) - r = airyai_power_series(z)[2] * exp(T(2)/3 * z * sqrt(z)) + r = airyai_power_series(z)[2] * exp(T(2/3) * z * sqrt(z)) else - r = exp(2 * z * sqrt(z) / 3) * (-cispi(T(2)/3) * _airyaiprime(-z*cispi(one(T)/3)) - cispi(-T(2)/3) * _airyaiprime(-z*cispi(-one(T)/3))) + r = exp(T(2/3) * z * sqrt(z)) * (-cispi(T(2)/3) * _airyaiprime(-z*cispi(one(T)/3)) - cispi(-T(2)/3) * _airyaiprime(-z*cispi(-one(T)/3))) end return check_conj ? conj(r) : r @@ -199,7 +199,7 @@ airybi(z::Number) = _airybi(float(z)) function _airybi(z::Complex{T}) where T <: Union{Float32, Float64} if ~isfinite(z) - if abs(angle(z)) < 2π/3 + if abs(angle(z)) < T(2π/3) return exp(z) else return 1 / z @@ -210,7 +210,7 @@ function _airybi(z::Complex{T}) where T <: Union{Float32, Float64} airybi_power_series_cutoff(x, y) && return airybi_power_series(z)[1] if x > zero(T) - zz = 2 * z * sqrt(z) / 3 + zz = T(2/3) * z * sqrt(z) shift = 20 order = one(T)/3 inu, inum1 = besseli_power_series_inu_inum1(order + shift, zz) @@ -222,7 +222,7 @@ function _airybi(z::Complex{T}) where T <: Union{Float32, Float64} else if iszero(y) xabs = abs(x) - xx = 2 * xabs * sqrt(xabs) / 3 + xx = T(2/3) * xabs * sqrt(xabs) Jv, Yv = besseljy_positive_args(one(T)/3, xx) Jmv = (Jv - sqrt(T(3)) * Yv) / 2 return convert(eltype(z), sqrt(xabs/3) * (Jmv - Jv)) @@ -256,7 +256,7 @@ airybiprime(z::Number) = _airybiprime(float(z)) function _airybiprime(z::Complex{T}) where T <: Union{Float32, Float64} if ~isfinite(z) - if abs(angle(z)) < 2*T(π)/3 + if abs(angle(z)) < T(2π/3) return exp(z) else return -1 / z @@ -267,9 +267,9 @@ function _airybiprime(z::Complex{T}) where T <: Union{Float32, Float64} airybi_power_series_cutoff(x, y) && return airybi_power_series(z)[2] if x > zero(T) - zz = 2 * z * sqrt(z) / 3 + zz = T(2/3) * z * sqrt(z) shift = 20 - order = T(2)/3 + order = T(2/3) inu, inum1 = besseli_power_series_inu_inum1(order + shift, zz) inu, inum1 = besselk_down_recurrence(zz, inum1, inu, order + shift - 1, order) @@ -279,12 +279,12 @@ function _airybiprime(z::Complex{T}) where T <: Union{Float32, Float64} else if iszero(y) xabs = abs(x) - xx = 2 * xabs * sqrt(xabs) / 3 + xx = T(2/3) * xabs * sqrt(xabs) Jv, Yv = besseljy_positive_args(T(2)/3, xx) Jmv = -(Jv + sqrt(T(3))*Yv) / 2 return convert(eltype(z), xabs * (Jv + Jmv) / sqrt(T(3))) else - return -(cispi(T(2)/3) * _airybiprime(-z*cispi(one(T)/3)) + cispi(-T(2)/3) * _airybiprime(-z*cispi(-one(T)/3))) + return -(cispi(T(2/3)) * _airybiprime(-z*cispi(one(T)/3)) + cispi(-T(2/3)) * _airybiprime(-z*cispi(-one(T)/3))) end end end @@ -355,7 +355,7 @@ function airybi_large_args(z::Complex{T}) where T else out = airybix_large_args(z) end - return out .* exp(2/3 * z * sqrt(z)) + return out .* exp(T(2/3) * z * sqrt(z)) end @inline function airyaix_large_args(z::Complex{T}) where T @@ -364,7 +364,7 @@ end A, B, C, D = compute_airy_asy_coef(z, xsqrx) if (real(z) < 0.0) && abs(imag(z)) < sqrt(3)*abs(real(z)) - e = exp(4/3 * z * xsqr) + e = exp(T(4/3) * z * xsqr) ai = muladd(B*im, e, A) aip = muladd(-D*im, e, C) else @@ -424,7 +424,7 @@ end invt = @fastmath inv(t) Vec{4, T}((reim(out * invt)..., reim(invt)...)) end - return @fastmath levin_transform(l) / (sqrt(T(π)^3) * sqrt(xsqr)) + return @fastmath levin_transform(l) / (T(π)^(3//2) * sqrt(xsqr)) end ) end @@ -442,7 +442,7 @@ end invt = @fastmath inv(t) Vec{4, T}((reim(out * invt)..., reim(invt)...)) end - return @fastmath levin_transform(l) * sqrt(xsqr) / sqrt(T(π)^3) + return @fastmath levin_transform(l) * sqrt(xsqr) / T(π)^(3//2) end ) end From d201df1277dab9ed4b2804a7e8f03813b396de9b Mon Sep 17 00:00:00 2001 From: Michael Helton Date: Mon, 24 Apr 2023 18:22:52 -0400 Subject: [PATCH 05/19] improve expression --- src/Airy/cairy.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Airy/cairy.jl b/src/Airy/cairy.jl index 1ddfab8..8d8b73e 100644 --- a/src/Airy/cairy.jl +++ b/src/Airy/cairy.jl @@ -420,7 +420,7 @@ end l = @ntuple $N i -> begin out += t - t *= -3 * a * (i - 5//6) * (i - 1//6) / i + t *= a * (3 - 3*i - (5//12)/i) invt = @fastmath inv(t) Vec{4, T}((reim(out * invt)..., reim(invt)...)) end From 647fd33cafed4a291cd70e95fa185985104d963b Mon Sep 17 00:00:00 2001 From: Michael Helton Date: Mon, 24 Apr 2023 18:29:23 -0400 Subject: [PATCH 06/19] save one mul --- src/Airy/cairy.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Airy/cairy.jl b/src/Airy/cairy.jl index 8d8b73e..18c4398 100644 --- a/src/Airy/cairy.jl +++ b/src/Airy/cairy.jl @@ -416,7 +416,7 @@ end xsqr = sqrt(x) out = zero(typeof(x)) t = GAMMA_ONE_SIXTH(T) * GAMMA_FIVE_SIXTHS(T) / 4 - a = inv(4*xsqr*x) + a = T(0.25) / (xsqr*x) l = @ntuple $N i -> begin out += t @@ -434,7 +434,7 @@ end xsqr = sqrt(x) out = zero(typeof(x)) t = -GAMMA_ONE_SIXTH(T) * GAMMA_FIVE_SIXTHS(T) / 4 - a = inv(4*xsqr*x) + a = T(0.25) / (xsqr*x) l = @ntuple $N i -> begin out += t From 13804c976f95c1e4a45f5afea81d8035bf6b7dd3 Mon Sep 17 00:00:00 2001 From: Michael Helton Date: Tue, 25 Apr 2023 14:53:33 -0400 Subject: [PATCH 07/19] add airybi levin --- src/Airy/cairy.jl | 132 ++++++++++++++++++++++++++-------------------- test/airy_test.jl | 4 +- 2 files changed, 76 insertions(+), 60 deletions(-) diff --git a/src/Airy/cairy.jl b/src/Airy/cairy.jl index 18c4398..0bb4ce7 100644 --- a/src/Airy/cairy.jl +++ b/src/Airy/cairy.jl @@ -149,7 +149,7 @@ function _airyaiprime(z::Complex{T}) where T <: Union{Float32, Float64} else r = -cispi(T(2/3)) * _airyaiprime(-z*cispi(one(T)/3)) - cispi(-T(2/3)) * _airyaiprime(-z*cispi(-one(T)/3)) end - + return check_conj ? conj(r) : r end @@ -206,32 +206,17 @@ function _airybi(z::Complex{T}) where T <: Union{Float32, Float64} end end x, y = real(z), imag(z) - airy_large_argument_cutoff(z) && return airybi_large_args(z)[1] - airybi_power_series_cutoff(x, y) && return airybi_power_series(z)[1] - - if x > zero(T) - zz = T(2/3) * z * sqrt(z) - shift = 20 - order = one(T)/3 - inu, inum1 = besseli_power_series_inu_inum1(order + shift, zz) - inu, inum1 = besselk_down_recurrence(zz, inum1, inu, order + shift - 1, order) - - inu2, inum2 = besseli_power_series_inu_inum1(-order + shift, zz) - inu2, inum2 = besselk_down_recurrence(zz, inum2, inu2, -order + shift - 1, -order) - return sqrt(z/3) * (inu + inu2) + + if airy_large_argument_cutoff(z) + return airybi_large_args(z)[1] + elseif x > 1.1 && y > 4.2 + return airybi_levin(z, Val(16)) + elseif angle(z) < 7pi/8 || x > -4.5 + return airybi_power_series(z)[1] else - if iszero(y) - xabs = abs(x) - xx = T(2/3) * xabs * sqrt(xabs) - Jv, Yv = besseljy_positive_args(one(T)/3, xx) - Jmv = (Jv - sqrt(T(3)) * Yv) / 2 - return convert(eltype(z), sqrt(xabs/3) * (Jmv - Jv)) - else - return cispi(one(T)/3) * _airybi(-z * cispi(one(T)/3)) + cispi(-one(T)/3) * _airybi(-z*cispi(-one(T)/3)) - end + return cispi(-1/6) * _airyai(z * cispi(-2/3)) + cispi(1/6) * _airyai(z*cispi(2/3)) end end - """ airybiprime(z) @@ -263,29 +248,15 @@ function _airybiprime(z::Complex{T}) where T <: Union{Float32, Float64} end end x, y = real(z), imag(z) - airy_large_argument_cutoff(z) && return airybi_large_args(z)[2] - airybi_power_series_cutoff(x, y) && return airybi_power_series(z)[2] - - if x > zero(T) - zz = T(2/3) * z * sqrt(z) - shift = 20 - order = T(2/3) - inu, inum1 = besseli_power_series_inu_inum1(order + shift, zz) - inu, inum1 = besselk_down_recurrence(zz, inum1, inu, order + shift - 1, order) - - inu2, inum2 = besseli_power_series_inu_inum1(-order + shift, zz) - inu2, inum2 = besselk_down_recurrence(zz, inum2, inu2, -order + shift - 1, -order) - return z / sqrt(3) * (inu + inu2) + + if airy_large_argument_cutoff(z) + return airybi_large_args(z)[2] + elseif x > 1.1 && y > 4.2 + return airybiprime_levin(z, Val(16)) + elseif angle(z) < 7pi/8 || x > -4.5 + return airybi_power_series(z)[2] else - if iszero(y) - xabs = abs(x) - xx = T(2/3) * xabs * sqrt(xabs) - Jv, Yv = besseljy_positive_args(T(2)/3, xx) - Jmv = -(Jv + sqrt(T(3))*Yv) / 2 - return convert(eltype(z), xabs * (Jv + Jmv) / sqrt(T(3))) - else - return -(cispi(T(2/3)) * _airybiprime(-z*cispi(one(T)/3)) + cispi(-T(2/3)) * _airybiprime(-z*cispi(-one(T)/3))) - end + return cispi(-5/6) * airyaiprime(z * cispi(-2/3)) + cispi(5/6) * airyaiprime(z*cispi(2/3)) end end @@ -325,17 +296,6 @@ airyai_power_series_cutoff(x::T, y::T) where T <: Float32 = x < 4.5f0 && abs(y) airybi_power_series_cutoff(x::T, y::T) where T <: Float64 = (abs(y) > -1.4*(x + 5.5) && abs(y) < -2.2*(x - 4)) || (x > zero(T) && abs(y) < 3) airybi_power_series_cutoff(x::T, y::T) where T <: Float32 = abs(complex(x, y)) < 9 -# calculates besselk from the power series of besseli using the continued fraction and wronskian -# this shift the order higher first to avoid cancellation in the power series of besseli along the imaginary axis -# for real arguments this is not needed because besseli can be computed stably over the entire real axis -function besselk_continued_fraction_shift(nu, x) - shift = 20 - inu, inum1 = besseli_power_series_inu_inum1(nu + shift, x) - inu, inum1 = besselk_down_recurrence(x, inum1, inu, nu + shift - 1, nu) - H_knu = besselk_ratio_knu_knup1(nu-1, x) - return 1 / (x * (inum1 + inu / H_knu)) -end - ##### ##### Large argument expansion for airy functions ##### @@ -442,7 +402,63 @@ end invt = @fastmath inv(t) Vec{4, T}((reim(out * invt)..., reim(invt)...)) end - return @fastmath levin_transform(l) * sqrt(xsqr) / T(π)^(3//2) + return @fastmath levin_transform(l) * sqrt(xsqr) / T(π)^(3//2) + end + ) +end + +@generated function airybi_levin(x::Complex{T}, ::Val{N}) where {T <: Union{Float32, Float64}, N} + :( + begin + xsqr = sqrt(x) + out = zero(typeof(x)) + t = GAMMA_ONE_SIXTH(T) * GAMMA_FIVE_SIXTHS(T) / 4 + a = T(0.25) / (xsqr*x) + + l = @ntuple $N i -> begin + out += t + t *= -3 * a * (i - 5//6) * (i - 1//6) / i + invt = @fastmath inv(t) + Vec{4, T}((reim(out * invt)..., reim(invt)...)) + end + out = zero(typeof(x)) + t = GAMMA_ONE_SIXTH(T) * GAMMA_FIVE_SIXTHS(T) / 4 + l2 = @ntuple $N i -> begin + out += t + t *= 3 * a * (i - 5//6) * (i - 1//6) / i + invt = @fastmath inv(t) + Vec{4, T}((reim(out * invt)..., reim(invt)...)) + end + e = exp(-2/3 * x * sqrt(x)) + return @fastmath (e*im*levin_transform(l) + 2*levin_transform(l2)/e) / (sqrt(T(π)^3) * sqrt(xsqr)) + end + ) +end + +@generated function airybiprime_levin(x::Complex{T}, ::Val{N}) where {T <: Union{Float32, Float64}, N} + :( + begin + xsqr = sqrt(x) + out = zero(typeof(x)) + t = GAMMA_ONE_SIXTH(T) * GAMMA_FIVE_SIXTHS(T) / 4 + a = T(0.25) / (xsqr*x) + + l = @ntuple $N i -> begin + out += t + t *= -3 * a * (i - 7//6) * (i + 1//6) / i + invt = @fastmath inv(t) + Vec{4, T}((reim(out * invt)..., reim(invt)...)) + end + out = zero(typeof(x)) + t = GAMMA_ONE_SIXTH(T) * GAMMA_FIVE_SIXTHS(T) / 4 + l2 = @ntuple $N i -> begin + out += t + t *= 3 * a * (i - 7//6) * (i + 1//6) / i + invt = @fastmath inv(t) + Vec{4, T}((reim(out * invt)..., reim(invt)...)) + end + e = exp(-2/3 * x * sqrt(x)) + return @fastmath -(e*im*levin_transform(l) - 2*levin_transform(l2)/e) * sqrt(xsqr) / (sqrt(T(π)^3)) end ) end diff --git a/test/airy_test.jl b/test/airy_test.jl index 7e949ff..4422fde 100644 --- a/test/airy_test.jl +++ b/test/airy_test.jl @@ -52,8 +52,8 @@ for x in [0.0, 0.01, 0.5, 1.0, 2.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 50.0], a z = x*exp(im*a) @test isapprox(airyai(z), SpecialFunctions.airyai(z), rtol=5e-10) @test isapprox(airyaiprime(z), SpecialFunctions.airyaiprime(z), rtol=5e-10) - @test isapprox(airybi(z), SpecialFunctions.airybi(z), rtol=1e-11) - @test isapprox(airybiprime(z), SpecialFunctions.airybiprime(z), rtol=1e-11) + @test isapprox(airybi(z), SpecialFunctions.airybi(z), rtol=5e-10) + @test isapprox(airybiprime(z), SpecialFunctions.airybiprime(z), rtol=5e-10) end From 33d30af3e0b514a5061b0e9d8ee7ba75fabd0098 Mon Sep 17 00:00:00 2001 From: Michael Helton Date: Wed, 26 Apr 2023 11:04:41 -0400 Subject: [PATCH 08/19] avoid complex div in airy levin --- src/Airy/cairy.jl | 42 ++++++++++++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 14 deletions(-) diff --git a/src/Airy/cairy.jl b/src/Airy/cairy.jl index 0bb4ce7..e0f9d76 100644 --- a/src/Airy/cairy.jl +++ b/src/Airy/cairy.jl @@ -374,17 +374,24 @@ end :( begin xsqr = sqrt(x) - out = zero(typeof(x)) + xsqrx = xsqr * x + t = GAMMA_ONE_SIXTH(T) * GAMMA_FIVE_SIXTHS(T) / 4 - a = T(0.25) / (xsqr*x) + t2 = 1 / t + a = @fastmath inv(4*xsqrx) + a2 = 4 * xsqrx + + s = zero(typeof(x)) l = @ntuple $N i -> begin - out += t - t *= a * (3 - 3*i - (5//12)/i) - invt = @fastmath inv(t) - Vec{4, T}((reim(out * invt)..., reim(invt)...)) + s += t + + t *= -a * (3 * (i - 5//6) * (i - 1//6) / i) + t2 *= -a2 * (i / (3 * (i - 5//6) * (i - 1//6))) + + Vec{4, T}((reim(s * t2)..., reim(t2)...)) end - return @fastmath levin_transform(l) / (T(π)^(3//2) * sqrt(xsqr)) + return levin_transform(l) / (T(π)^(3//2) * sqrt(xsqr)) end ) end @@ -392,17 +399,24 @@ end :( begin xsqr = sqrt(x) - out = zero(typeof(x)) + xsqrx = xsqr * x + t = -GAMMA_ONE_SIXTH(T) * GAMMA_FIVE_SIXTHS(T) / 4 - a = T(0.25) / (xsqr*x) + t2 = 1 / t + + a = @fastmath inv(4*xsqrx) + a2 = 4 * xsqrx + s = zero(typeof(x)) l = @ntuple $N i -> begin - out += t - t *= -3 * a * (i - 7//6) * (i + 1//6) / i - invt = @fastmath inv(t) - Vec{4, T}((reim(out * invt)..., reim(invt)...)) + s += t + + t *= -a * (3 * (i - 7//6) * (i + 1//6) / i) + t2 *= -a2 * (i / (3 * (i - 7//6) * (i + 1//6))) + + Vec{4, T}((reim(s * t2)..., reim(t2)...)) end - return @fastmath levin_transform(l) * sqrt(xsqr) / T(π)^(3//2) + return levin_transform(l) * sqrt(xsqr) / T(π)^(3//2) end ) end From 5d395f7d066dc45f64698ff45b7da57d9f2d6bae Mon Sep 17 00:00:00 2001 From: Michael Helton Date: Wed, 26 Apr 2023 12:53:59 -0400 Subject: [PATCH 09/19] rearrange besselk_levin and add tests --- src/Bessels.jl | 9 +++++---- src/besselk.jl | 38 ++++++++++++++++++++++++++++++++++++++ src/math.jl | 41 ----------------------------------------- test/besselk_test.jl | 6 ++++++ 4 files changed, 49 insertions(+), 45 deletions(-) diff --git a/src/Bessels.jl b/src/Bessels.jl index 830892d..9ca2c57 100644 --- a/src/Bessels.jl +++ b/src/Bessels.jl @@ -42,6 +42,11 @@ export airybiprimex const ComplexOrReal{T} = Union{T,Complex{T}} +include("math.jl") +include("constants.jl") +include("math_constants.jl") + +include("gamma.jl") include("besseli.jl") include("besselj.jl") include("besselk.jl") @@ -58,14 +63,10 @@ include("Float128/besselk.jl") include("Float128/bessely.jl") include("Float128/constants.jl") -include("constants.jl") -include("math_constants.jl") include("U_polynomials.jl") include("recurrence.jl") -include("math.jl") include("Polynomials/besselj_polys.jl") include("asymptotics.jl") -include("gamma.jl") include("precompile.jl") diff --git a/src/besselk.jl b/src/besselk.jl index 7939286..2286b45 100644 --- a/src/besselk.jl +++ b/src/besselk.jl @@ -570,3 +570,41 @@ function _besselk_large_argument(v, x::ComplexOrReal{T}) where T end return res end + +##### +##### Levin sequence transform for K_{nu}(x) +##### + +@generated function besselkx_levin(v, x::T, ::Val{N}) where {T <: FloatTypes, N} + :( + begin + l = let (out, term, fv2) = (one(typeof(x)), one(typeof(x)), 4*v^2) + @ntuple $N i -> begin + offset = muladd(2, i, -1) + term *= muladd(offset, -offset, fv2) / (8 * x * i) + out += term + invterm = inv(term) + Vec{2, T}((out * invterm, invterm)) + end + end + return levin_transform(l) * sqrt(π / 2x) + end + ) +end + +@generated function besselkx_levin(v, x::Complex{T}, ::Val{N}) where {T <: FloatTypes, N} + :( + begin + l = let (out, term, fv2, invx) = (one(typeof(x)), one(typeof(x)), 4*v^2, inv(8*x)) + @ntuple $N i -> begin + offset = muladd(2, i, -1) + term *= @fastmath muladd(offset, -offset, fv2) * invx / i + out += term + invterm = @fastmath inv(term) + Vec{4, T}((reim(out * invterm)..., reim(invterm)...)) + end + end + return levin_transform(l) * sqrt(π / 2x) + end + ) +end \ No newline at end of file diff --git a/src/math.jl b/src/math.jl index 7fd11ae..0d57396 100644 --- a/src/math.jl +++ b/src/math.jl @@ -122,44 +122,3 @@ end end ) end - -@generated function besselkx_levin(v, x::T, ::Val{N}) where {T <: FloatTypes, N} - :( - begin - # generate a sequence of partial sums and weights of the asymptotic expansion for large argument besselk - l = let (out, term, fv2) = (one(typeof(x)), one(typeof(x)), 4*v^2) - @ntuple $N i -> begin - offset = muladd(2, i, -1) - term *= muladd(offset, -offset, fv2) / (8 * x * i) - out += term - # this check is relatively cheap but if the normal sum converges then we should return value - abs(term) <= eps(T) && return out * sqrt(π / 2x) - # need to return the weights and weighted partial sums - invterm = inv(term) - Vec{2, T}((out * invterm, invterm)) - end - end - return levin_transform(l) * sqrt(π / 2x) - end - ) -end -@generated function besselkx_levin(v, x::Complex{T}, ::Val{N}) where {T <: FloatTypes, N} - :( - begin - # generate a sequence of partial sums and weights of the asymptotic expansion for large argument besselk - l = let (out, term, fv2, invx) = (one(typeof(x)), one(typeof(x)), 4*v^2, inv(8*x)) - @ntuple $N i -> begin - offset = muladd(2, i, -1) - term *= @fastmath muladd(offset, -offset, fv2) * invx / i - out += term - # this check is relatively cheap but if the normal sum converges then we should return value - (abs(real(term)) <= eps(T) && abs(imag(term)) <= eps(T)) && return out * sqrt(π / 2x) - # need to return the weights and weighted partial sums - invterm = @fastmath inv(term) - Vec{4, T}((reim(out * invterm)..., reim(invterm)...)) - end - end - return levin_transform(l) * sqrt(π / 2x) - end - ) -end diff --git a/test/besselk_test.jl b/test/besselk_test.jl index 76e0bc0..335cdd8 100644 --- a/test/besselk_test.jl +++ b/test/besselk_test.jl @@ -165,3 +165,9 @@ end (v, x) = -14.6, -10.6 #@test besselk(v,x) ≈ -0.0180385087581148387140033906859 - 1.54653251445680014758965158559*im + + +# test besselk_levin for real and complex + +@test Bessels.besselkx_levin(1.1, 2.4, Val(16)) ≈ SpecialFunctions.besselkx(1.1, 2.4) +@test Bessels.besselkx_levin(1.1, 2.4 + 1.1im, Val(16)) ≈ SpecialFunctions.besselkx(1.1, 2.4 + 1.1im) From 2574f49bfc79907a51dfc9bb130d6bae3a06aeef Mon Sep 17 00:00:00 2001 From: Michael Helton Date: Wed, 26 Apr 2023 13:46:18 -0400 Subject: [PATCH 10/19] reorg into modules --- src/AiryFunctions/AiryFunctions.jl | 17 ++++++ src/{Airy => AiryFunctions}/airy.jl | 0 src/{Airy => AiryFunctions}/cairy.jl | 24 ++++---- src/BesselFunctions/BesselFunctions.jl | 60 +++++++++++++++++++ src/{ => BesselFunctions}/Float128/besselj.jl | 0 src/{ => BesselFunctions}/Float128/besselk.jl | 0 src/{ => BesselFunctions}/Float128/bessely.jl | 0 .../Float128/constants.jl | 0 .../Polynomials/besselj_polys.jl | 0 src/{ => BesselFunctions}/U_polynomials.jl | 0 src/{ => BesselFunctions}/asymptotics.jl | 0 src/{ => BesselFunctions}/besseli.jl | 0 src/{ => BesselFunctions}/besselj.jl | 0 src/{ => BesselFunctions}/besselk.jl | 0 src/{ => BesselFunctions}/bessely.jl | 11 ---- src/{ => BesselFunctions}/constants.jl | 0 src/{ => BesselFunctions}/hankel.jl | 0 .../modifiedsphericalbessel.jl | 0 src/{ => BesselFunctions}/recurrence.jl | 0 src/{ => BesselFunctions}/sphericalbessel.jl | 0 src/Bessels.jl | 35 +++-------- src/Float128/besseli.jl | 0 src/GammaFunctions/GammaFunctions.jl | 9 +++ src/{ => GammaFunctions}/gamma.jl | 0 src/{math.jl => Math/Math.jl} | 40 ++++++++++++- src/{ => Math}/math_constants.jl | 5 ++ src/precompile.jl | 1 + test/besselj_test.jl | 22 +++---- test/besselk_test.jl | 4 +- test/bessely_test.jl | 6 +- 30 files changed, 166 insertions(+), 68 deletions(-) create mode 100644 src/AiryFunctions/AiryFunctions.jl rename src/{Airy => AiryFunctions}/airy.jl (100%) rename src/{Airy => AiryFunctions}/cairy.jl (97%) create mode 100644 src/BesselFunctions/BesselFunctions.jl rename src/{ => BesselFunctions}/Float128/besselj.jl (100%) rename src/{ => BesselFunctions}/Float128/besselk.jl (100%) rename src/{ => BesselFunctions}/Float128/bessely.jl (100%) rename src/{ => BesselFunctions}/Float128/constants.jl (100%) rename src/{ => BesselFunctions}/Polynomials/besselj_polys.jl (100%) rename src/{ => BesselFunctions}/U_polynomials.jl (100%) rename src/{ => BesselFunctions}/asymptotics.jl (100%) rename src/{ => BesselFunctions}/besseli.jl (100%) rename src/{ => BesselFunctions}/besselj.jl (100%) rename src/{ => BesselFunctions}/besselk.jl (100%) rename src/{ => BesselFunctions}/bessely.jl (99%) rename src/{ => BesselFunctions}/constants.jl (100%) rename src/{ => BesselFunctions}/hankel.jl (100%) rename src/{ => BesselFunctions}/modifiedsphericalbessel.jl (100%) rename src/{ => BesselFunctions}/recurrence.jl (100%) rename src/{ => BesselFunctions}/sphericalbessel.jl (100%) delete mode 100644 src/Float128/besseli.jl create mode 100644 src/GammaFunctions/GammaFunctions.jl rename src/{ => GammaFunctions}/gamma.jl (100%) rename src/{math.jl => Math/Math.jl} (81%) rename src/{ => Math}/math_constants.jl (92%) diff --git a/src/AiryFunctions/AiryFunctions.jl b/src/AiryFunctions/AiryFunctions.jl new file mode 100644 index 0000000..b489fe3 --- /dev/null +++ b/src/AiryFunctions/AiryFunctions.jl @@ -0,0 +1,17 @@ +module AiryFunctions + +using ..Math + +export airyai +export airyaix +export airyaiprime +export airyaiprimex +export airybi +export airybix +export airybiprime +export airybiprimex + +include("airy.jl") +include("cairy.jl") + +end \ No newline at end of file diff --git a/src/Airy/airy.jl b/src/AiryFunctions/airy.jl similarity index 100% rename from src/Airy/airy.jl rename to src/AiryFunctions/airy.jl diff --git a/src/Airy/cairy.jl b/src/AiryFunctions/cairy.jl similarity index 97% rename from src/Airy/cairy.jl rename to src/AiryFunctions/cairy.jl index e0f9d76..642008b 100644 --- a/src/Airy/cairy.jl +++ b/src/AiryFunctions/cairy.jl @@ -268,7 +268,7 @@ end function airyai_power_series(z::Complex{T}) where T z2 = z * z z3 = z2 * z - p = SIMDMath.horner_simd(z3, pack_AIRYAI_POW_COEF) + p = horner_simd(z3, pack_AIRYAI_POW_COEF) # TODO: use complex shufflevector to SIMD this muladd ai = muladd(-p[2], z, p[1]) aip = muladd(p[4], z2, -p[3]) @@ -278,7 +278,7 @@ end function airybi_power_series(z::Complex{T}) where T z2 = z * z z3 = z2 * z - p = SIMDMath.horner_simd(z3, pack_AIRYBI_POW_COEF) + p = horner_simd(z3, pack_AIRYBI_POW_COEF) bi = muladd(p[2], z, p[1]) bip = muladd(p[4], z2, p[3]) return bi, bip @@ -356,15 +356,15 @@ end @inline function compute_airy_asy_coef(z, xsqrx) invx3 = @fastmath inv(z^3) - p = SIMDMath.horner_simd(invx3, pack_AIRY_ASYM_COEF) + p = horner_simd(invx3, pack_AIRY_ASYM_COEF) - pvec1 = SIMDMath.ComplexVec((p[1], p[3])) - pvec2 = SIMDMath.ComplexVec((p[2], p[4])) + pvec1 = ComplexVec((p[1], p[3])) + pvec2 = ComplexVec((p[2], p[4])) - zvec = SIMDMath.ComplexVec{2, Float64}((xsqrx.re, xsqrx.re), (xsqrx.im, xsqrx.im)) - zvec = SIMDMath.fmul(zvec, pvec2) - a = SIMDMath.fadd(pvec1, zvec) - b = SIMDMath.fsub(pvec1, zvec) + zvec = ComplexVec{2, Float64}((xsqrx.re, xsqrx.re), (xsqrx.im, xsqrx.im)) + zvec = fmul(zvec, pvec2) + a = fadd(pvec1, zvec) + b = fsub(pvec1, zvec) A, B, C, D = b[1], a[1], b[2], a[2] return A, B, C, D @@ -485,7 +485,7 @@ airyai_levin_cutoff(x::T, y::T) where T <: Union{Float32, Float64} = x > 2.0 || # tuple(Float64.([3^k * gamma(k + 1//6) * gamma(k + 5//6) / (2^(2k+2) * gamma(k+1)) for k in big"0":big"24"])...) # tuple(Float64.([(3)^k * gamma(k - 1//6) * gamma(k + 7//6) / (2^(2k+2) * gamma(k+1)) for k in big"0":big"24"])...) -const pack_AIRY_ASYM_COEF = SIMDMath.pack_poly(( +const pack_AIRY_ASYM_COEF = pack_poly(( (1.5707963267948966, 0.13124057851910487, 0.4584353787485384, 5.217255928936184, 123.97197893818594, 5038.313653002081, 312467.7049060495, 2.746439545069411e7, 3.2482560591146026e9, 4.97462635569055e11, 9.57732308323407e13, 2.2640712393216476e16, 6.447503420809101e18), (0.1636246173744684, 0.20141783231057064, 1.3848568733028765, 23.555289417250567, 745.2667344964557, 37835.063701047824, 2.8147130917899106e6, 2.8856687720069575e8, 3.8998976239149216e10, 6.718472897263214e12, 1.4370735281142392e15, 3.7367429394637446e17, 1.1608192617215053e20), (-1.5707963267948966, 0.15510250188621483, 0.4982993247266722, 5.515384839161109, 129.24738229725767, 5209.103946324185, 321269.61208650155, 2.812618811215662e7, 3.3166403972012258e9, 5.0676100258903735e11, 9.738286496397669e13, 2.298637212441062e16, 6.537678293827411e18), @@ -507,14 +507,14 @@ const pack_AIRY_ASYM_COEF = SIMDMath.pack_poly(( # tuple(Float64.([3^(-2k + 1//6) / (gamma(k + 1//3) * gamma(k+1)) for k in big"0":big"31"])...) # tuple(Float64.([3^(-2k - 7//6) / (gamma(k + 5//3) * gamma(k+1)) for k in big"0":big"31"])...) -const pack_AIRYAI_POW_COEF = SIMDMath.pack_poly(( +const pack_AIRYAI_POW_COEF = pack_poly(( (0.3550280538878172, 0.05917134231463621, 0.00197237807715454, 2.7394139960479725e-5, 2.0753136333696763e-7, 9.882445873188935e-10, 3.2295574748983444e-12, 7.689422559281772e-15, 1.3930113332032197e-17, 1.984346628494615e-20, 2.280858193671971e-23, 2.159903592492397e-26, 1.7142092003907912e-29, 1.156686370034272e-32, 6.717110162800651e-36, 3.392479880202349e-39, 1.5037588121464314e-42, 5.897093380966397e-46, 2.060479867563381e-49, 6.455137429709841e-53, 1.8234851496355484e-56, 4.6684207619957715e-60, 1.0882099678311822e-63, 2.3192880814816327e-67, 4.536948516200377e-71, 8.174682011171851e-75, 1.3610859159460292e-78, 2.1004412283117732e-82, 3.0126810503611208e-86, 4.026571839563112e-90, 5.026931135534472e-94, 5.875328582906115e-98), (0.2588194037928068, 0.021568283649400565, 0.0005135305630809659, 5.705895145344065e-6, 3.657625093169273e-8, 1.5240104554871968e-10, 4.456170922477184e-13, 9.645391607093471e-16, 1.6075652678489119e-18, 2.1264090844562326e-21, 2.286461381135734e-24, 2.0378443682136668e-27, 1.5299131893495997e-30, 9.8071358291641e-34, 5.430307768086435e-37, 2.623337086032094e-40, 1.1153644073265705e-43, 4.2057481422570536e-47, 1.4160768155747653e-50, 4.2833539491069734e-54, 1.1703152866412495e-57, 2.902567675201512e-61, 6.56392509091251e-65, 1.3589907020522795e-68, 2.585598748196879e-72, 4.536138154731366e-76, 7.361470552955803e-80, 1.108321372019844e-83, 1.5522708291594454e-87, 2.0275219816607177e-91, 2.4756068152145513e-95, 2.8318540553815505e-99), (0.2588194037928068, 0.08627313459760226, 0.003594713941566761, 5.705895145344065e-5, 4.7549126211200545e-7, 2.438416728779515e-9, 8.46672475270665e-12, 2.1219861535605637e-14, 4.0189131696222797e-17, 5.953945436477451e-20, 7.088030281520775e-23, 6.928670851926468e-26, 5.660678800593519e-29, 3.9228543316656404e-32, 2.335032340277167e-35, 1.206735059574763e-38, 5.4652855959001957e-42, 2.1869890339736677e-45, 7.78842248566121e-49, 2.4843452904820447e-52, 7.138923248511622e-56, 1.8576433121289676e-59, 4.397829810911382e-63, 9.512934914365956e-67, 1.8874870861837215e-70, 3.4474649975958385e-74, 5.815561736835084e-78, 9.08823525056272e-82, 1.3194302047855285e-85, 1.7842193438614314e-89, 2.2528022018452416e-93, 2.6619428120586576e-97), (0.1775140269439086, 0.011834268462927242, 0.0002465472596443175, 2.4903763600436113e-6, 1.48236688097834e-8, 5.81320345481702e-11, 1.6147787374491722e-13, 3.3432271996877272e-16, 5.35773589693546e-19, 6.842574581015913e-22, 7.12768185522491e-25, 6.171153121406848e-28, 4.511076843133661e-31, 2.8211862683762735e-34, 1.526615946091057e-37, 7.21804229830287e-41, 3.0075176242928623e-44, 1.1126591284842257e-47, 3.6794283349346094e-51, 1.094091089781329e-54, 2.941105080057336e-58, 7.182185787685802e-62, 1.6003087762223267e-65, 3.2666029316642714e-69, 6.131011508378888e-73, 1.0616470144379027e-76, 1.7013573949325364e-80, 2.5306520823033415e-84, 3.5031175004199077e-88, 4.5242380219810255e-92, 5.4640555821026875e-96, 6.184556403059069e-100) )) -const pack_AIRYBI_POW_COEF = SIMDMath.pack_poly(( +const pack_AIRYBI_POW_COEF = pack_poly(( (0.6149266274460007, 0.10248777124100013, 0.0034162590413666706, 4.7448042241203764e-5, 3.5945486546366485e-7, 1.7116898355412612e-9, 5.5937576324877815e-12, 1.3318470553542337e-14, 2.412766404627235e-17, 3.4369891803806764e-20, 3.9505622762996283e-23, 3.7410627616473754e-26, 2.9690974298788695e-29, 2.0034395613217741e-32, 1.163437608200798e-35, 5.875947516165647e-39, 2.604586664967042e-42, 1.0214065352811929e-45, 3.568855818592568e-49, 1.1180625998097016e-52, 3.1583689260161066e-56, 8.08594195088609e-60, 1.884834953586501e-63, 4.017124794515134e-67, 7.858225341383282e-71, 1.415896457906898e-74, 2.3574699598849448e-78, 3.6380709257483713e-82, 5.2181166462254325e-86, 6.9742270064493885e-90, 8.706900132895616e-94, 1.0176367616755045e-97), (0.4482883573538264, 0.03735736311281886, 0.0008894610264956872, 9.882900294396524e-6, 6.335192496408029e-8, 2.6396635401700117e-10, 7.718314444941556e-13, 1.6706308322384318e-15, 2.7843847203973864e-18, 3.683048571954215e-21, 3.960267281671199e-24, 3.52964998366417e-27, 2.6498873751232507e-30, 1.698645753284135e-33, 9.405568955061657e-37, 4.5437531183872734e-40, 1.9318678224435686e-43, 7.284569466227635e-47, 2.4527169919958367e-50, 7.418986666654073e-54, 2.0270455373371784e-57, 5.0273946858560976e-61, 1.136905175453663e-64, 2.353840942968246e-68, 4.478388399863482e-72, 7.856821754146459e-76, 1.275044101614161e-79, 1.919668927452817e-83, 2.688611943211228e-87, 3.511771085699096e-91, 4.28787678351538e-95, 4.904915103540814e-99), (0.4482883573538264, 0.14942945245127545, 0.0062262271854698105, 9.882900294396525e-5, 8.235750245330437e-7, 4.223461664272019e-9, 1.4664797445388956e-11, 3.67538783092455e-14, 6.960961800993466e-17, 1.0312536001471802e-19, 1.2276828573180715e-22, 1.2000809944458178e-25, 9.804583287956028e-29, 6.79458301313654e-32, 4.0443946506765124e-35, 2.0901264344581458e-38, 9.466152329973487e-42, 3.78797612243837e-45, 1.3489943455977101e-48, 4.3030122666593625e-52, 1.236497777775679e-55, 3.2175325989479025e-59, 7.617264675539542e-63, 1.6476886600777724e-66, 3.269223531900342e-70, 5.97118453315131e-74, 1.007284840275187e-77, 1.5741285205113097e-81, 2.2853201517295438e-85, 3.0903585554152047e-89, 3.901967872998996e-93, 4.6106201973283655e-97), diff --git a/src/BesselFunctions/BesselFunctions.jl b/src/BesselFunctions/BesselFunctions.jl new file mode 100644 index 0000000..3a8a8e0 --- /dev/null +++ b/src/BesselFunctions/BesselFunctions.jl @@ -0,0 +1,60 @@ +module BesselFunctions + +using ..GammaFunctions +using ..Math + +export besselj0 +export besselj1 +export besselj +export besselj! + +export bessely0 +export bessely1 +export bessely +export bessely! + +export sphericalbesselj +export sphericalbessely +export sphericalbesseli +export sphericalbesselk + +export besseli +export besselix +export besseli0 +export besseli0x +export besseli1 +export besseli1x +export besseli! + +export besselk +export besselkx +export besselk0 +export besselk0x +export besselk1 +export besselk1x +export besselk! + +export besselh +export hankelh1 +export hankelh2 + +include("besseli.jl") +include("besselj.jl") +include("besselk.jl") +include("bessely.jl") +include("hankel.jl") +include("sphericalbessel.jl") +include("modifiedsphericalbessel.jl") + + +include("constants.jl") +include("U_polynomials.jl") +include("recurrence.jl") +include("Polynomials/besselj_polys.jl") +include("asymptotics.jl") + +include("Float128/constants.jl") +include("Float128/besselj.jl") +include("Float128/bessely.jl") + +end \ No newline at end of file diff --git a/src/Float128/besselj.jl b/src/BesselFunctions/Float128/besselj.jl similarity index 100% rename from src/Float128/besselj.jl rename to src/BesselFunctions/Float128/besselj.jl diff --git a/src/Float128/besselk.jl b/src/BesselFunctions/Float128/besselk.jl similarity index 100% rename from src/Float128/besselk.jl rename to src/BesselFunctions/Float128/besselk.jl diff --git a/src/Float128/bessely.jl b/src/BesselFunctions/Float128/bessely.jl similarity index 100% rename from src/Float128/bessely.jl rename to src/BesselFunctions/Float128/bessely.jl diff --git a/src/Float128/constants.jl b/src/BesselFunctions/Float128/constants.jl similarity index 100% rename from src/Float128/constants.jl rename to src/BesselFunctions/Float128/constants.jl diff --git a/src/Polynomials/besselj_polys.jl b/src/BesselFunctions/Polynomials/besselj_polys.jl similarity index 100% rename from src/Polynomials/besselj_polys.jl rename to src/BesselFunctions/Polynomials/besselj_polys.jl diff --git a/src/U_polynomials.jl b/src/BesselFunctions/U_polynomials.jl similarity index 100% rename from src/U_polynomials.jl rename to src/BesselFunctions/U_polynomials.jl diff --git a/src/asymptotics.jl b/src/BesselFunctions/asymptotics.jl similarity index 100% rename from src/asymptotics.jl rename to src/BesselFunctions/asymptotics.jl diff --git a/src/besseli.jl b/src/BesselFunctions/besseli.jl similarity index 100% rename from src/besseli.jl rename to src/BesselFunctions/besseli.jl diff --git a/src/besselj.jl b/src/BesselFunctions/besselj.jl similarity index 100% rename from src/besselj.jl rename to src/BesselFunctions/besselj.jl diff --git a/src/besselk.jl b/src/BesselFunctions/besselk.jl similarity index 100% rename from src/besselk.jl rename to src/BesselFunctions/besselk.jl diff --git a/src/bessely.jl b/src/BesselFunctions/bessely.jl similarity index 99% rename from src/bessely.jl rename to src/BesselFunctions/bessely.jl index cfaac31..09cd2bd 100644 --- a/src/bessely.jl +++ b/src/BesselFunctions/bessely.jl @@ -488,17 +488,6 @@ function bessely_chebyshev_low_orders(v, x) return clenshaw_chebyshev(v1, a), clenshaw_chebyshev(v2, a) end -# uses the Clenshaw algorithm to recursively evaluate a linear combination of Chebyshev polynomials -function clenshaw_chebyshev(x, c) - x2 = 2x - c0 = c[end-1] - c1 = c[end] - for i in length(c)-2:-1:1 - c0, c1 = c[i] - c1, c0 + c1 * x2 - end - return c0 + c1 * x -end - # to generate the Chebyshev weights #= using ArbNumerics, FastChebInterp diff --git a/src/constants.jl b/src/BesselFunctions/constants.jl similarity index 100% rename from src/constants.jl rename to src/BesselFunctions/constants.jl diff --git a/src/hankel.jl b/src/BesselFunctions/hankel.jl similarity index 100% rename from src/hankel.jl rename to src/BesselFunctions/hankel.jl diff --git a/src/modifiedsphericalbessel.jl b/src/BesselFunctions/modifiedsphericalbessel.jl similarity index 100% rename from src/modifiedsphericalbessel.jl rename to src/BesselFunctions/modifiedsphericalbessel.jl diff --git a/src/recurrence.jl b/src/BesselFunctions/recurrence.jl similarity index 100% rename from src/recurrence.jl rename to src/BesselFunctions/recurrence.jl diff --git a/src/sphericalbessel.jl b/src/BesselFunctions/sphericalbessel.jl similarity index 100% rename from src/sphericalbessel.jl rename to src/BesselFunctions/sphericalbessel.jl diff --git a/src/Bessels.jl b/src/Bessels.jl index 9ca2c57..48a145c 100644 --- a/src/Bessels.jl +++ b/src/Bessels.jl @@ -5,10 +5,12 @@ using SIMDMath export besselj0 export besselj1 export besselj +export besselj! export bessely0 export bessely1 export bessely +export bessely! export sphericalbesselj export sphericalbessely @@ -40,33 +42,14 @@ export airybix export airybiprime export airybiprimex -const ComplexOrReal{T} = Union{T,Complex{T}} +include("Math/Math.jl") +include("GammaFunctions/GammaFunctions.jl") +include("AiryFunctions/AiryFunctions.jl") +include("BesselFunctions/BesselFunctions.jl") -include("math.jl") -include("constants.jl") -include("math_constants.jl") - -include("gamma.jl") -include("besseli.jl") -include("besselj.jl") -include("besselk.jl") -include("bessely.jl") -include("hankel.jl") -include("Airy/airy.jl") -include("Airy/cairy.jl") -include("sphericalbessel.jl") -include("modifiedsphericalbessel.jl") - -include("Float128/besseli.jl") -include("Float128/besselj.jl") -include("Float128/besselk.jl") -include("Float128/bessely.jl") -include("Float128/constants.jl") - -include("U_polynomials.jl") -include("recurrence.jl") -include("Polynomials/besselj_polys.jl") -include("asymptotics.jl") +using .GammaFunctions +using .AiryFunctions +using .BesselFunctions include("precompile.jl") diff --git a/src/Float128/besseli.jl b/src/Float128/besseli.jl deleted file mode 100644 index e69de29..0000000 diff --git a/src/GammaFunctions/GammaFunctions.jl b/src/GammaFunctions/GammaFunctions.jl new file mode 100644 index 0000000..39a0399 --- /dev/null +++ b/src/GammaFunctions/GammaFunctions.jl @@ -0,0 +1,9 @@ +module GammaFunctions + +using ..Math: SQ2PI + +export gamma + +include("gamma.jl") + +end diff --git a/src/gamma.jl b/src/GammaFunctions/gamma.jl similarity index 100% rename from src/gamma.jl rename to src/GammaFunctions/gamma.jl diff --git a/src/math.jl b/src/Math/Math.jl similarity index 81% rename from src/math.jl rename to src/Math/Math.jl index 0d57396..fb97b41 100644 --- a/src/math.jl +++ b/src/Math/Math.jl @@ -1,5 +1,29 @@ +module Math + +const ComplexOrReal{T} = Union{T,Complex{T}} + +include("math_constants.jl") + +export ComplexOrReal + +export sin_sum, cos_sum +export levin_transform +export clenshaw_chebyshev + +export Vec, ComplexVec, FloatTypes +export fmadd, fmul, fadd, fsub +export horner_simd, pack_poly + +export @nexprs, @ntuple + using Base.Math: sin_kernel, cos_kernel, sincos_kernel, rem_pio2_kernel, DoubleFloat64, DoubleFloat32 +using Base.Cartesian: @nexprs, @ntuple + +using SIMDMath: Vec, ComplexVec, FloatTypes +using SIMDMath: fmadd, fmul, fadd, fsub +using SIMDMath: horner_simd, pack_poly + """ computes sin(sum(xs)) where xs are sorted by absolute value Doing this is much more accurate than the naive sin(sum(xs)) @@ -91,10 +115,18 @@ function rem_pio2_sum(xs::Vararg{Float16, N}) where N return n, DoubleFloat32(y.hi) end -# Levin's Sequence transformation +# uses the Clenshaw algorithm to recursively evaluate a linear combination of Chebyshev polynomials +function clenshaw_chebyshev(x, c) + x2 = 2x + c0 = c[end-1] + c1 = c[end] + for i in length(c)-2:-1:1 + c0, c1 = c[i] - c1, c0 + c1 * x2 + end + return c0 + c1 * x +end -using Base.Cartesian -using SIMDMath: fmadd, Vec, FloatTypes +# Levin's Sequence transformation #@inline levin_scale(B::T, n, k) where T = -(B + n) * (B + n + k)^(k - one(T)) / (B + n + k + one(T))^k @inline levin_scale(B::T, n, k) where T = -(B + n + k) * (B + n + k - 1) / ((B + n + 2k) * (B + n + 2k - 1)) @@ -122,3 +154,5 @@ end end ) end + +end \ No newline at end of file diff --git a/src/math_constants.jl b/src/Math/math_constants.jl similarity index 92% rename from src/math_constants.jl rename to src/Math/math_constants.jl index 1dca3d3..03782ca 100644 --- a/src/math_constants.jl +++ b/src/Math/math_constants.jl @@ -1,3 +1,8 @@ +export ONEOSQPI, TWOOPI, PIO2, SQPIO2, SQ1O2PI, SQ2OPI +export THPIO4, SQ2PI, PIPOW3O2, PIO4, SQ2O2 + +export GAMMA_TWO_THIRDS, GAMMA_ONE_THIRD, GAMMA_ONE_SIXTH, GAMMA_FIVE_SIXTHS + const ONEOSQPI(::Type{BigFloat}) = big"5.6418958354775628694807945156077258584405E-1" const TWOOPI(::Type{BigFloat}) = big"6.3661977236758134307553505349005744813784E-1" const PIO2(::Type{BigFloat}) = big"1.570796326794896619231321691639751442098584699687552910487472296153908203143099" diff --git a/src/precompile.jl b/src/precompile.jl index 1c313ce..a35a881 100644 --- a/src/precompile.jl +++ b/src/precompile.jl @@ -1,3 +1,4 @@ precompile(besselj, (Float64, Float64)) +precompile(bessely, (Float64, Float64)) precompile(airyai, (Float64,)) precompile(airyai, (ComplexF64,)) diff --git a/test/besselj_test.jl b/test/besselj_test.jl index 523010a..3511b76 100644 --- a/test/besselj_test.jl +++ b/test/besselj_test.jl @@ -102,8 +102,8 @@ for v in nu, xx in x xx *= v sf = SpecialFunctions.besselj(BigFloat(v), BigFloat(xx)) @test isapprox(besselj(v, xx), Float64(sf), rtol=5e-14) - @test isapprox(Bessels.besseljy_positive_args(v, xx)[1], Float64(sf), rtol=5e-14) - @test isapprox(Bessels.besselj(Float32(v), Float32(xx)), Float32(sf)) + @test isapprox(Bessels.BesselFunctions.besseljy_positive_args(v, xx)[1], Float64(sf), rtol=5e-14) + @test isapprox(besselj(Float32(v), Float32(xx)), Float32(sf)) end # test half orders (SpecialFunctions does not give big float precision) @@ -115,8 +115,8 @@ for v in nu, xx in x xx *= v sf = SpecialFunctions.besselj(v, xx) @test isapprox(besselj(v, xx), sf, rtol=1e-12) - @test isapprox(Bessels.besseljy_positive_args(v, xx)[1], sf, rtol=1e-12) - @test isapprox(Bessels.besselj(Float32(v), Float32(xx)), Float32(sf)) + @test isapprox(Bessels.BesselFunctions.besseljy_positive_args(v, xx)[1], sf, rtol=1e-12) + @test isapprox(besselj(Float32(v), Float32(xx)), Float32(sf)) end ## test large orders @@ -126,7 +126,7 @@ for v in nu, xx in x xx *= v sf = SpecialFunctions.besselj(v, xx) @test isapprox(besselj(v, xx), sf, rtol=5e-11) - @test isapprox(Bessels.besseljy_positive_args(v, xx)[1], Float64(sf), rtol=5e-11) + @test isapprox(Bessels.BesselFunctions.besseljy_positive_args(v, xx)[1], Float64(sf), rtol=5e-11) end # test nu_range @@ -135,7 +135,7 @@ end @test besselj(0.5:1:150.5, 2.0) ≈ SpecialFunctions.besselj.(0.5:1:150.5, 2.0) rtol=1e-11 @test besselj(0.5:1:10.5, 40.0) ≈ SpecialFunctions.besselj.(0.5:1:10.5, 40.0) rtol=1e-11 @test besselj(3:4, 1.2) ≈ SpecialFunctions.besselj.(3:4, 1.2) -@test Bessels.besselj!(zeros(Float64, 10), 1:10, 1.0) ≈ besselj(1:10, 1.0) +@test besselj!(zeros(Float64, 10), 1:10, 1.0) ≈ besselj(1:10, 1.0) # test Float16 and Float32 @test besselj(Int16(10), Float16(1.0)) isa Float16 @@ -143,13 +143,13 @@ end ## test large arguments @test isapprox(besselj(10.0, 150.0), SpecialFunctions.besselj(10.0, 150.0), rtol=1e-12) -@test isapprox(Bessels.besseljy_large_argument(15.0, 100.0)[1], SpecialFunctions.besselj(15.0, 100.0), rtol=1e-12) -@test isapprox(Bessels.besseljy_large_argument(15.0, 45.0)[1], SpecialFunctions.besselj(15.0, 45.0), rtol=1e-12) -@test isapprox(Bessels.besseljy_large_argument(15.0, 25.5)[1], SpecialFunctions.besselj(15.0, 25.5), rtol=1e-12) +@test isapprox(Bessels.BesselFunctions.besseljy_large_argument(15.0, 100.0)[1], SpecialFunctions.besselj(15.0, 100.0), rtol=1e-12) +@test isapprox(Bessels.BesselFunctions.besseljy_large_argument(15.0, 45.0)[1], SpecialFunctions.besselj(15.0, 45.0), rtol=1e-12) +@test isapprox(Bessels.BesselFunctions.besseljy_large_argument(15.0, 25.5)[1], SpecialFunctions.besselj(15.0, 25.5), rtol=1e-12) # test BigFloat for single point -@test isapprox(Bessels.besseljy_debye(big"2000", big"1500.0")[1], SpecialFunctions.besselj(big"2000", big"1500"), rtol=5e-20) -@test isapprox(Bessels.besseljy_large_argument(big"20", big"1500.0")[1], SpecialFunctions.besselj(big"20", big"1500"), rtol=5e-20) +@test isapprox(Bessels.BesselFunctions.besseljy_debye(big"2000", big"1500.0")[1], SpecialFunctions.besselj(big"2000", big"1500"), rtol=5e-20) +@test isapprox(Bessels.BesselFunctions.besseljy_large_argument(big"20", big"1500.0")[1], SpecialFunctions.besselj(big"20", big"1500"), rtol=5e-20) # need to test accuracy of negative orders and negative arguments and all combinations within # SpecialFunctions.jl doesn't provide these so will hand check against hard values diff --git a/test/besselk_test.jl b/test/besselk_test.jl index 335cdd8..ba0e851 100644 --- a/test/besselk_test.jl +++ b/test/besselk_test.jl @@ -169,5 +169,5 @@ end # test besselk_levin for real and complex -@test Bessels.besselkx_levin(1.1, 2.4, Val(16)) ≈ SpecialFunctions.besselkx(1.1, 2.4) -@test Bessels.besselkx_levin(1.1, 2.4 + 1.1im, Val(16)) ≈ SpecialFunctions.besselkx(1.1, 2.4 + 1.1im) +@test Bessels.BesselFunctions.besselkx_levin(1.1, 2.4, Val(16)) ≈ SpecialFunctions.besselkx(1.1, 2.4) +@test Bessels.BesselFunctions.besselkx_levin(1.1, 2.4 + 1.1im, Val(16)) ≈ SpecialFunctions.besselkx(1.1, 2.4 + 1.1im) diff --git a/test/bessely_test.jl b/test/bessely_test.jl index 618bbc6..4ff3834 100644 --- a/test/bessely_test.jl +++ b/test/bessely_test.jl @@ -82,7 +82,7 @@ for v in nu, xx in x xx *= v sf = SpecialFunctions.bessely(BigFloat(v), BigFloat(xx)) @test isapprox(bessely(v, xx), Float64(sf), rtol=2e-13) - @test isapprox(Bessels.besseljy_positive_args(v, xx)[2], Float64(sf), rtol=5e-12) + @test isapprox(Bessels.BesselFunctions.besseljy_positive_args(v, xx)[2], Float64(sf), rtol=5e-12) @test isapprox(bessely(Float32(v), Float32(xx)), Float32(sf)) end @@ -95,7 +95,7 @@ for v in nu, xx in x xx *= v sf = SpecialFunctions.bessely(v, xx) @test isapprox(bessely(v, xx), sf, rtol=5e-12) - @test isapprox(Bessels.besseljy_positive_args(v, xx)[2], SpecialFunctions.bessely(v, xx), rtol=5e-12) + @test isapprox(Bessels.BesselFunctions.besseljy_positive_args(v, xx)[2], SpecialFunctions.bessely(v, xx), rtol=5e-12) @test isapprox(bessely(Float32(v), Float32(xx)), Float32(sf)) end @@ -104,7 +104,7 @@ end @test bessely(0:50, 100.0) ≈ SpecialFunctions.bessely.(0:50, 100.0) rtol=1e-11 @test bessely(0.5:1:10.5, 2.0) ≈ SpecialFunctions.bessely.(0.5:1:10.5, 2.0) rtol=1e-11 @test bessely(0.5:1:10.5, 40.0) ≈ SpecialFunctions.bessely.(0.5:1:10.5, 40.0) rtol=1e-11 -@test Bessels.bessely!(zeros(Float64, 10), 1:10, 1.0) ≈ bessely(1:10, 1.0) +@test bessely!(zeros(Float64, 10), 1:10, 1.0) ≈ bessely(1:10, 1.0) # test Float16 @test bessely(Int16(10), Float16(1.0)) isa Float16 From bc399760e94a32ffc143e4795433e7f827c1bee3 Mon Sep 17 00:00:00 2001 From: Michael Helton Date: Wed, 26 Apr 2023 13:53:01 -0400 Subject: [PATCH 11/19] remove SIMDMath at top level --- src/Bessels.jl | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/Bessels.jl b/src/Bessels.jl index 48a145c..390810a 100644 --- a/src/Bessels.jl +++ b/src/Bessels.jl @@ -1,7 +1,5 @@ module Bessels -using SIMDMath - export besselj0 export besselj1 export besselj From 8c71a980c8a6cac0d1bc1c861100e3c19184ed38 Mon Sep 17 00:00:00 2001 From: Michael Helton Date: Wed, 26 Apr 2023 14:38:53 -0400 Subject: [PATCH 12/19] improve sequence generation in airybi_levin --- src/AiryFunctions/AiryFunctions.jl | 2 +- src/AiryFunctions/cairy.jl | 81 ++++++++++++++++-------------- 2 files changed, 44 insertions(+), 39 deletions(-) diff --git a/src/AiryFunctions/AiryFunctions.jl b/src/AiryFunctions/AiryFunctions.jl index b489fe3..6b0fb21 100644 --- a/src/AiryFunctions/AiryFunctions.jl +++ b/src/AiryFunctions/AiryFunctions.jl @@ -14,4 +14,4 @@ export airybiprimex include("airy.jl") include("cairy.jl") -end \ No newline at end of file +end diff --git a/src/AiryFunctions/cairy.jl b/src/AiryFunctions/cairy.jl index 642008b..302e8b1 100644 --- a/src/AiryFunctions/cairy.jl +++ b/src/AiryFunctions/cairy.jl @@ -378,17 +378,14 @@ end t = GAMMA_ONE_SIXTH(T) * GAMMA_FIVE_SIXTHS(T) / 4 t2 = 1 / t - a = @fastmath inv(4*xsqrx) a2 = 4 * xsqrx s = zero(typeof(x)) l = @ntuple $N i -> begin s += t - t *= -a * (3 * (i - 5//6) * (i - 1//6) / i) t2 *= -a2 * (i / (3 * (i - 5//6) * (i - 1//6))) - Vec{4, T}((reim(s * t2)..., reim(t2)...)) end return levin_transform(l) / (T(π)^(3//2) * sqrt(xsqr)) @@ -403,17 +400,14 @@ end t = -GAMMA_ONE_SIXTH(T) * GAMMA_FIVE_SIXTHS(T) / 4 t2 = 1 / t - a = @fastmath inv(4*xsqrx) a2 = 4 * xsqrx s = zero(typeof(x)) l = @ntuple $N i -> begin s += t - t *= -a * (3 * (i - 7//6) * (i + 1//6) / i) t2 *= -a2 * (i / (3 * (i - 7//6) * (i + 1//6))) - Vec{4, T}((reim(s * t2)..., reim(t2)...)) end return levin_transform(l) * sqrt(xsqr) / T(π)^(3//2) @@ -424,27 +418,33 @@ end @generated function airybi_levin(x::Complex{T}, ::Val{N}) where {T <: Union{Float32, Float64}, N} :( begin + xsqr = sqrt(x) - out = zero(typeof(x)) - t = GAMMA_ONE_SIXTH(T) * GAMMA_FIVE_SIXTHS(T) / 4 - a = T(0.25) / (xsqr*x) + xsqrx = xsqr * x - l = @ntuple $N i -> begin - out += t - t *= -3 * a * (i - 5//6) * (i - 1//6) / i - invt = @fastmath inv(t) - Vec{4, T}((reim(out * invt)..., reim(invt)...)) - end - out = zero(typeof(x)) t = GAMMA_ONE_SIXTH(T) * GAMMA_FIVE_SIXTHS(T) / 4 - l2 = @ntuple $N i -> begin - out += t - t *= 3 * a * (i - 5//6) * (i - 1//6) / i - invt = @fastmath inv(t) - Vec{4, T}((reim(out * invt)..., reim(invt)...)) + t2 = 1 / t + a = inv(4*xsqrx) + a2 = 4 * xsqrx + + s = zero(typeof(x)) + s2 = zero(typeof(x)) + m = 1 + @nexprs $N i -> begin + s += t + s2 += t * m + t *= -a * (3 * (i - 5//6) * (i - 1//6) / i) + t2 *= -a2 * (i / (3 * (i - 5//6) * (i - 1//6))) + l_{i} = Vec{4, T}((reim(s * t2)..., reim(t2)...)) + w_{i} = Vec{4, T}((reim(s2 * t2 * m)..., reim(t2 * m)...)) + m *= -1 end + + l = @ntuple $N i -> l_{i} + w = @ntuple $N i -> w_{i} + e = exp(-2/3 * x * sqrt(x)) - return @fastmath (e*im*levin_transform(l) + 2*levin_transform(l2)/e) / (sqrt(T(π)^3) * sqrt(xsqr)) + return @fastmath (e*im*levin_transform(l) + 2*levin_transform(w)/e) / (sqrt(T(π)^3) * sqrt(xsqr)) end ) end @@ -453,26 +453,31 @@ end :( begin xsqr = sqrt(x) - out = zero(typeof(x)) - t = GAMMA_ONE_SIXTH(T) * GAMMA_FIVE_SIXTHS(T) / 4 - a = T(0.25) / (xsqr*x) + xsqrx = xsqr * x - l = @ntuple $N i -> begin - out += t - t *= -3 * a * (i - 7//6) * (i + 1//6) / i - invt = @fastmath inv(t) - Vec{4, T}((reim(out * invt)..., reim(invt)...)) - end - out = zero(typeof(x)) t = GAMMA_ONE_SIXTH(T) * GAMMA_FIVE_SIXTHS(T) / 4 - l2 = @ntuple $N i -> begin - out += t - t *= 3 * a * (i - 7//6) * (i + 1//6) / i - invt = @fastmath inv(t) - Vec{4, T}((reim(out * invt)..., reim(invt)...)) + t2 = 1 / t + a = inv(4*xsqrx) + a2 = 4 * xsqrx + + s = zero(typeof(x)) + s2 = zero(typeof(x)) + m = 1 + @nexprs $N i -> begin + s += t + s2 += t * m + t *= -a * (3 * (i - 7//6) * (i + 1//6) / i) + t2 *= -a2 * (i / (3 * (i - 7//6) * (i + 1//6))) + l_{i} = Vec{4, T}((reim(s * t2)..., reim(t2)...)) + w_{i} = Vec{4, T}((reim(s2 * t2 * m)..., reim(t2 * m)...)) + m *= -1 end + + l = @ntuple $N i -> l_{i} + w = @ntuple $N i -> w_{i} + e = exp(-2/3 * x * sqrt(x)) - return @fastmath -(e*im*levin_transform(l) - 2*levin_transform(l2)/e) * sqrt(xsqr) / (sqrt(T(π)^3)) + return -(e*im*levin_transform(l) - 2*levin_transform(w)/e) * sqrt(xsqr) / (sqrt(T(π)^3)) end ) end From 077d01aa4ea9975ae38e6d1671fc6468636cab48 Mon Sep 17 00:00:00 2001 From: Michael Helton Date: Wed, 26 Apr 2023 14:39:44 -0400 Subject: [PATCH 13/19] remove uncessary fastmath --- src/AiryFunctions/cairy.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/AiryFunctions/cairy.jl b/src/AiryFunctions/cairy.jl index 302e8b1..ce0ff5c 100644 --- a/src/AiryFunctions/cairy.jl +++ b/src/AiryFunctions/cairy.jl @@ -444,7 +444,7 @@ end w = @ntuple $N i -> w_{i} e = exp(-2/3 * x * sqrt(x)) - return @fastmath (e*im*levin_transform(l) + 2*levin_transform(w)/e) / (sqrt(T(π)^3) * sqrt(xsqr)) + return (e*im*levin_transform(l) + 2*levin_transform(w)/e) / (sqrt(T(π)^3) * sqrt(xsqr)) end ) end From 8d394bae7753f40a12b51042d85b5d41853e64df Mon Sep 17 00:00:00 2001 From: Michael Helton Date: Wed, 26 Apr 2023 15:45:50 -0400 Subject: [PATCH 14/19] cleanup and add scaled airy tests --- src/AiryFunctions/cairy.jl | 105 ++++++++++++++++++++----------------- test/airy_test.jl | 2 + 2 files changed, 60 insertions(+), 47 deletions(-) diff --git a/src/AiryFunctions/cairy.jl b/src/AiryFunctions/cairy.jl index ce0ff5c..0ba3fa4 100644 --- a/src/AiryFunctions/cairy.jl +++ b/src/AiryFunctions/cairy.jl @@ -47,26 +47,20 @@ External links: [DLMF](https://dlmf.nist.gov/9.2.2), [Wikipedia](https://en.wiki See also: [`airyaiprime`](@ref), [`airybi`](@ref) """ airyai(z::Number) = _airyai(float(z)) +airyaix(z::Number) = _airyaix(float(z)) function _airyai(z::Complex{T}) where T <: Union{Float32, Float64} - if ~isfinite(z) - if abs(angle(z)) < T(2π/3) - return exp(-z) - else - return 1 / z - end - end - x, y = reim(z) check_conj = false if y < zero(T) z = conj(z) + y = -y check_conj = true end if airy_large_argument_cutoff(x, y) - r = airyaix_large_args(z)[1] * exp(-T(2/3) * z * sqrt(z)) + r = airyai_large_args(z)[1] elseif airyai_levin_cutoff(x, y) r = airyaix_levin(z, Val(17)) * exp(-T(2/3) * z * sqrt(z)) elseif airyai_power_series_cutoff(x, y) @@ -85,6 +79,7 @@ function _airyaix(z::Complex{T}) where T <: Union{Float32, Float64} check_conj = false if y < zero(T) z = conj(z) + y = -y check_conj = true end @@ -122,26 +117,20 @@ External links: [DLMF](https://dlmf.nist.gov/9.2), [Wikipedia](https://en.wikipe See also: [`airyai`](@ref), [`airybi`](@ref) """ airyaiprime(z::Number) = _airyaiprime(float(z)) +airyaiprimex(z::Number) = _airyaiprimex(float(z)) function _airyaiprime(z::Complex{T}) where T <: Union{Float32, Float64} - if ~isfinite(z) - if abs(angle(z)) < T(2π/3) - return -exp(-z) - else - return 1 / z - end - end - x, y = reim(z) check_conj = false if y < zero(T) z = conj(z) + y = -y check_conj = true end if airy_large_argument_cutoff(x, y) - r = airyaix_large_args(z)[2] * exp(-T(2/3) * z * sqrt(z)) + r = airyai_large_args(z)[2] elseif airyai_levin_cutoff(x, y) r = airyaiprimex_levin(z, Val(17)) * exp(-T(2/3) * z * sqrt(z)) elseif airyai_power_series_cutoff(x, y) @@ -159,6 +148,7 @@ function _airyaiprimex(z::Complex{T}) where T <: Union{Float32, Float64} check_conj = false if y < zero(T) z = conj(z) + y = -y check_conj = true end @@ -198,24 +188,26 @@ See also: [`airybiprime`](@ref), [`airyai`](@ref) airybi(z::Number) = _airybi(float(z)) function _airybi(z::Complex{T}) where T <: Union{Float32, Float64} - if ~isfinite(z) - if abs(angle(z)) < T(2π/3) - return exp(z) - else - return 1 / z - end - end x, y = real(z), imag(z) + check_conj = false + if y < zero(T) + z = conj(z) + y = -y + check_conj = true + end + if airy_large_argument_cutoff(z) - return airybi_large_args(z)[1] + r = airybi_large_args(z)[1] elseif x > 1.1 && y > 4.2 - return airybi_levin(z, Val(16)) + r = airybi_levin(z, Val(16)) elseif angle(z) < 7pi/8 || x > -4.5 - return airybi_power_series(z)[1] + r = airybi_power_series(z)[1] else - return cispi(-1/6) * _airyai(z * cispi(-2/3)) + cispi(1/6) * _airyai(z*cispi(2/3)) + r = cispi(-1/6) * _airyai(z * cispi(-2/3)) + cispi(1/6) * _airyai(z*cispi(2/3)) end + + return check_conj ? conj(r) : r end """ airybiprime(z) @@ -240,24 +232,26 @@ See also: [`airybi`](@ref), [`airyai`](@ref) airybiprime(z::Number) = _airybiprime(float(z)) function _airybiprime(z::Complex{T}) where T <: Union{Float32, Float64} - if ~isfinite(z) - if abs(angle(z)) < T(2π/3) - return exp(z) - else - return -1 / z - end - end x, y = real(z), imag(z) + check_conj = false + if y < zero(T) + z = conj(z) + y = -y + check_conj = true + end + if airy_large_argument_cutoff(z) - return airybi_large_args(z)[2] + r = airybi_large_args(z)[2] elseif x > 1.1 && y > 4.2 - return airybiprime_levin(z, Val(16)) + r = airybiprime_levin(z, Val(16)) elseif angle(z) < 7pi/8 || x > -4.5 - return airybi_power_series(z)[2] + r = airybi_power_series(z)[2] else - return cispi(-5/6) * airyaiprime(z * cispi(-2/3)) + cispi(5/6) * airyaiprime(z*cispi(2/3)) + r = cispi(-5/6) * airyaiprime(z * cispi(-2/3)) + cispi(5/6) * airyaiprime(z*cispi(2/3)) end + + return check_conj ? conj(r) : r end ##### @@ -307,18 +301,26 @@ airy_large_argument_cutoff(z::ComplexOrReal{Float32}) = abs(z) > 4 # valid in 0 <= angle(z) <= pi # use conjugation for bottom half plane -airyai_large_args(z::Complex{T}) where T = airyaix_large_args(z) .* exp(-2/3 * z * sqrt(z)) +function airyai_large_args(z::Complex{T}) where T + out = airyaix_large_args(z) + return isfinite(z) ? out .* exp(-T(2/3) * z * sqrt(z)) : out +end function airybi_large_args(z::Complex{T}) where T - if imag(z) < zero(T) - out = conj.(airybix_large_args(conj(z))) - else - out = airybix_large_args(z) - end - return out .* exp(T(2/3) * z * sqrt(z)) + out = airybix_large_args(z) + return isfinite(z) ? out .* exp(T(2/3) * z * sqrt(z)) : out end @inline function airyaix_large_args(z::Complex{T}) where T + if ~isfinite(z) + if abs(angle(z)) < T(2π/3) + e = exp(-z) + return (e, -e) + else + invz = 1 / z + return (invz, invz) + end + end xsqr = sqrt(z) xsqrx = Base.FastMath.inv_fast(z * xsqr) A, B, C, D = compute_airy_asy_coef(z, xsqrx) @@ -337,6 +339,15 @@ end end @inline function airybix_large_args(z::Complex{T}) where T + if ~isfinite(z) + if abs(angle(z)) < T(2π/3) + e = exp(z) + return (e, e) + else + invz = 1 / z + return (invz, -invz) + end + end xsqr = sqrt(z) xsqrx = Base.FastMath.inv_fast(z * xsqr) A, B, C, D = compute_airy_asy_coef(z, xsqrx) diff --git a/test/airy_test.jl b/test/airy_test.jl index 4422fde..5e0843e 100644 --- a/test/airy_test.jl +++ b/test/airy_test.jl @@ -51,7 +51,9 @@ end for x in [0.0, 0.01, 0.5, 1.0, 2.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 50.0], a in 0:pi/12:2pi z = x*exp(im*a) @test isapprox(airyai(z), SpecialFunctions.airyai(z), rtol=5e-10) + @test isapprox(airyaix(z), SpecialFunctions.airyaix(z), rtol=5e-10) @test isapprox(airyaiprime(z), SpecialFunctions.airyaiprime(z), rtol=5e-10) + @test isapprox(airyaiprimex(z), SpecialFunctions.airyaiprimex(z), rtol=5e-10) @test isapprox(airybi(z), SpecialFunctions.airybi(z), rtol=5e-10) @test isapprox(airybiprime(z), SpecialFunctions.airybiprime(z), rtol=5e-10) end From 283ee5eec0d80d678856fdea992797fb062c3a1c Mon Sep 17 00:00:00 2001 From: Michael Helton Date: Wed, 26 Apr 2023 15:49:41 -0400 Subject: [PATCH 15/19] switch to isinf --- src/AiryFunctions/cairy.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/AiryFunctions/cairy.jl b/src/AiryFunctions/cairy.jl index 0ba3fa4..5d80376 100644 --- a/src/AiryFunctions/cairy.jl +++ b/src/AiryFunctions/cairy.jl @@ -312,7 +312,7 @@ function airybi_large_args(z::Complex{T}) where T end @inline function airyaix_large_args(z::Complex{T}) where T - if ~isfinite(z) + if isinf(z) if abs(angle(z)) < T(2π/3) e = exp(-z) return (e, -e) @@ -339,7 +339,7 @@ end end @inline function airybix_large_args(z::Complex{T}) where T - if ~isfinite(z) + if isinf(z) if abs(angle(z)) < T(2π/3) e = exp(z) return (e, e) From 5e1da6fd9df81938de4fb03bc06870240182f4b0 Mon Sep 17 00:00:00 2001 From: Michael Helton Date: Wed, 26 Apr 2023 15:56:52 -0400 Subject: [PATCH 16/19] add NaN check --- src/AiryFunctions/cairy.jl | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/AiryFunctions/cairy.jl b/src/AiryFunctions/cairy.jl index 5d80376..1072f34 100644 --- a/src/AiryFunctions/cairy.jl +++ b/src/AiryFunctions/cairy.jl @@ -50,6 +50,7 @@ airyai(z::Number) = _airyai(float(z)) airyaix(z::Number) = _airyaix(float(z)) function _airyai(z::Complex{T}) where T <: Union{Float32, Float64} + isnan(z) && return z x, y = reim(z) check_conj = false @@ -74,6 +75,7 @@ function _airyai(z::Complex{T}) where T <: Union{Float32, Float64} end function _airyaix(z::Complex{T}) where T <: Union{Float32, Float64} + isnan(z) && return z x, y = real(z), imag(z) check_conj = false @@ -120,6 +122,7 @@ airyaiprime(z::Number) = _airyaiprime(float(z)) airyaiprimex(z::Number) = _airyaiprimex(float(z)) function _airyaiprime(z::Complex{T}) where T <: Union{Float32, Float64} + isnan(z) && return z x, y = reim(z) check_conj = false @@ -143,6 +146,7 @@ function _airyaiprime(z::Complex{T}) where T <: Union{Float32, Float64} end function _airyaiprimex(z::Complex{T}) where T <: Union{Float32, Float64} + isnan(z) && return z x, y = reim(z) check_conj = false @@ -188,6 +192,7 @@ See also: [`airybiprime`](@ref), [`airyai`](@ref) airybi(z::Number) = _airybi(float(z)) function _airybi(z::Complex{T}) where T <: Union{Float32, Float64} + isnan(z) && return z x, y = real(z), imag(z) check_conj = false @@ -232,6 +237,7 @@ See also: [`airybi`](@ref), [`airyai`](@ref) airybiprime(z::Number) = _airybiprime(float(z)) function _airybiprime(z::Complex{T}) where T <: Union{Float32, Float64} + isnan(z) && return z x, y = real(z), imag(z) check_conj = false From 077e0ef8d1f4bee3eb979d1f3887c1b6448e4a21 Mon Sep 17 00:00:00 2001 From: Michael Helton Date: Thu, 27 Apr 2023 10:35:28 -0400 Subject: [PATCH 17/19] simplify besselk_levin (use con. prop) --- src/BesselFunctions/besselk.jl | 37 ++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/src/BesselFunctions/besselk.jl b/src/BesselFunctions/besselk.jl index 2286b45..63e037c 100644 --- a/src/BesselFunctions/besselk.jl +++ b/src/BesselFunctions/besselk.jl @@ -578,15 +578,14 @@ end @generated function besselkx_levin(v, x::T, ::Val{N}) where {T <: FloatTypes, N} :( begin - l = let (out, term, fv2) = (one(typeof(x)), one(typeof(x)), 4*v^2) - @ntuple $N i -> begin - offset = muladd(2, i, -1) - term *= muladd(offset, -offset, fv2) / (8 * x * i) - out += term - invterm = inv(term) - Vec{2, T}((out * invterm, invterm)) + s = zero(T) + t = one(T) + l = @ntuple $N i -> begin + s += t + t *= (4*v^2 - (2i - 1)^2) / (8 * x * i) + invterm = inv(t) + Vec{2, T}((s * invterm, invterm)) end - end return levin_transform(l) * sqrt(π / 2x) end ) @@ -595,16 +594,20 @@ end @generated function besselkx_levin(v, x::Complex{T}, ::Val{N}) where {T <: FloatTypes, N} :( begin - l = let (out, term, fv2, invx) = (one(typeof(x)), one(typeof(x)), 4*v^2, inv(8*x)) - @ntuple $N i -> begin - offset = muladd(2, i, -1) - term *= @fastmath muladd(offset, -offset, fv2) * invx / i - out += term - invterm = @fastmath inv(term) - Vec{4, T}((reim(out * invterm)..., reim(invterm)...)) + s = zero(T) + t = one(typeof(x)) + t2 = t + a = @fastmath inv(8*x) + a2 = 8*x + + l = @ntuple $N i -> begin + s += t + b = (4*v^2 - (2i - 1)^2) / i + t *= a * b + t2 *= a2 / b + Vec{4, T}((reim(s * t2)..., reim(t2)...)) end - end return levin_transform(l) * sqrt(π / 2x) end ) -end \ No newline at end of file +end From 47071c203f3d9df9edbf749cd38b81513d1331ec Mon Sep 17 00:00:00 2001 From: Michael Helton Date: Thu, 27 Apr 2023 18:16:05 -0400 Subject: [PATCH 18/19] fix levin for AD --- src/AiryFunctions/cairy.jl | 68 +++++++++++++++++++--------------- src/BesselFunctions/besselk.jl | 25 +++++++------ src/Math/Math.jl | 12 +++--- 3 files changed, 57 insertions(+), 48 deletions(-) diff --git a/src/AiryFunctions/cairy.jl b/src/AiryFunctions/cairy.jl index 1072f34..eaae0ac 100644 --- a/src/AiryFunctions/cairy.jl +++ b/src/AiryFunctions/cairy.jl @@ -398,14 +398,16 @@ end a = @fastmath inv(4*xsqrx) a2 = 4 * xsqrx - s = zero(typeof(x)) - l = @ntuple $N i -> begin - s += t + s_0 = zero(typeof(x)) + @nexprs $N i -> begin + s_{i} = s_{i-1} + t t *= -a * (3 * (i - 5//6) * (i - 1//6) / i) t2 *= -a2 * (i / (3 * (i - 5//6) * (i - 1//6))) - Vec{4, T}((reim(s * t2)..., reim(t2)...)) + w_{i} = t2 end - return levin_transform(l) / (T(π)^(3//2) * sqrt(xsqr)) + sequence = @ntuple $N i -> s_{i} + weights = @ntuple $N i -> w_{i} + return levin_transform(sequence, weights) / (T(π)^(3//2) * sqrt(xsqr)) end ) end @@ -420,14 +422,16 @@ end a = @fastmath inv(4*xsqrx) a2 = 4 * xsqrx - s = zero(typeof(x)) - l = @ntuple $N i -> begin - s += t + s_0 = zero(typeof(x)) + @nexprs $N i -> begin + s_{i} = s_{i-1} + t t *= -a * (3 * (i - 7//6) * (i + 1//6) / i) t2 *= -a2 * (i / (3 * (i - 7//6) * (i + 1//6))) - Vec{4, T}((reim(s * t2)..., reim(t2)...)) + w_{i} = t2 end - return levin_transform(l) * sqrt(xsqr) / T(π)^(3//2) + sequence = @ntuple $N i -> s_{i} + weights = @ntuple $N i -> w_{i} + return levin_transform(sequence, weights) * sqrt(xsqr) / T(π)^(3//2) end ) end @@ -444,24 +448,26 @@ end a = inv(4*xsqrx) a2 = 4 * xsqrx - s = zero(typeof(x)) - s2 = zero(typeof(x)) + s_0 = zero(typeof(x)) + p_0 = zero(typeof(x)) m = 1 @nexprs $N i -> begin - s += t - s2 += t * m + s_{i} = s_{i-1} + t + p_{i} = p_{i-1} + t * m t *= -a * (3 * (i - 5//6) * (i - 1//6) / i) t2 *= -a2 * (i / (3 * (i - 5//6) * (i - 1//6))) - l_{i} = Vec{4, T}((reim(s * t2)..., reim(t2)...)) - w_{i} = Vec{4, T}((reim(s2 * t2 * m)..., reim(t2 * m)...)) + w_{i} = t2 m *= -1 end - l = @ntuple $N i -> l_{i} - w = @ntuple $N i -> w_{i} + sequence1 = @ntuple $N i -> s_{i} + sequence2 = @ntuple $N i -> p_{i} + weights = @ntuple $N i -> w_{i} - e = exp(-2/3 * x * sqrt(x)) - return (e*im*levin_transform(l) + 2*levin_transform(w)/e) / (sqrt(T(π)^3) * sqrt(xsqr)) + e = exp(-T(2/3) * x * sqrt(x)) + l1 = levin_transform(sequence1, weights) + l2 = levin_transform(sequence2, (@ntuple $N i -> weights[i] * (iseven(i) ? 1 : -1))) + return (e * im * l1 + 2 * l2 / e) / (sqrt(T(π)^3) * sqrt(xsqr)) end ) end @@ -477,24 +483,26 @@ end a = inv(4*xsqrx) a2 = 4 * xsqrx - s = zero(typeof(x)) - s2 = zero(typeof(x)) + s_0 = zero(typeof(x)) + p_0 = zero(typeof(x)) m = 1 @nexprs $N i -> begin - s += t - s2 += t * m + s_{i} = s_{i-1} + t + p_{i} = p_{i-1} + t * m t *= -a * (3 * (i - 7//6) * (i + 1//6) / i) t2 *= -a2 * (i / (3 * (i - 7//6) * (i + 1//6))) - l_{i} = Vec{4, T}((reim(s * t2)..., reim(t2)...)) - w_{i} = Vec{4, T}((reim(s2 * t2 * m)..., reim(t2 * m)...)) + w_{i} = t2 m *= -1 end - l = @ntuple $N i -> l_{i} - w = @ntuple $N i -> w_{i} + sequence1 = @ntuple $N i -> s_{i} + sequence2 = @ntuple $N i -> p_{i} + weights = @ntuple $N i -> w_{i} - e = exp(-2/3 * x * sqrt(x)) - return -(e*im*levin_transform(l) - 2*levin_transform(w)/e) * sqrt(xsqr) / (sqrt(T(π)^3)) + l1 = levin_transform(sequence1, weights) + l2 = levin_transform(sequence2, (@ntuple $N i -> weights[i] * (iseven(i) ? 1 : -1))) + e = exp(-T(2/3) * x * sqrt(x)) + return -(e * im * l1 - 2 * l2 / e) * sqrt(xsqr) / (sqrt(T(π)^3)) end ) end diff --git a/src/BesselFunctions/besselk.jl b/src/BesselFunctions/besselk.jl index 63e037c..625aab9 100644 --- a/src/BesselFunctions/besselk.jl +++ b/src/BesselFunctions/besselk.jl @@ -578,15 +578,16 @@ end @generated function besselkx_levin(v, x::T, ::Val{N}) where {T <: FloatTypes, N} :( begin - s = zero(T) + s_0 = zero(T) t = one(T) - l = @ntuple $N i -> begin - s += t + @nexprs $N i -> begin + s_{i} = s_{i-1} + t t *= (4*v^2 - (2i - 1)^2) / (8 * x * i) - invterm = inv(t) - Vec{2, T}((s * invterm, invterm)) + w_{i} = 1 / t end - return levin_transform(l) * sqrt(π / 2x) + sequence = @ntuple $N i -> s_{i} + weights = @ntuple $N i -> w_{i} + return levin_transform(sequence, weights) * sqrt(π / 2x) end ) end @@ -594,20 +595,22 @@ end @generated function besselkx_levin(v, x::Complex{T}, ::Val{N}) where {T <: FloatTypes, N} :( begin - s = zero(T) + s_0 = zero(T) t = one(typeof(x)) t2 = t a = @fastmath inv(8*x) a2 = 8*x - l = @ntuple $N i -> begin - s += t + @nexprs $N i -> begin + s_{i} = s_{i-1} + t b = (4*v^2 - (2i - 1)^2) / i t *= a * b t2 *= a2 / b - Vec{4, T}((reim(s * t2)..., reim(t2)...)) + w_{i} = t2 end - return levin_transform(l) * sqrt(π / 2x) + sequence = @ntuple $N i -> s_{i} + weights = @ntuple $N i -> w_{i} + return levin_transform(sequence, weights) * sqrt(π / 2x) end ) end diff --git a/src/Math/Math.jl b/src/Math/Math.jl index fb97b41..8b1c840 100644 --- a/src/Math/Math.jl +++ b/src/Math/Math.jl @@ -131,28 +131,26 @@ end #@inline levin_scale(B::T, n, k) where T = -(B + n) * (B + n + k)^(k - one(T)) / (B + n + k + one(T))^k @inline levin_scale(B::T, n, k) where T = -(B + n + k) * (B + n + k - 1) / ((B + n + 2k) * (B + n + 2k - 1)) -# implementation for real numbers -@inline @generated function levin_transform(s::NTuple{N, Vec{2, T}}) where {N, T <: FloatTypes} +@inline @generated function levin_transform(s::NTuple{N, T}, w::NTuple{N, T}) where {N, T <: FloatTypes} len = N - 1 :( begin - @nexprs $N i -> a_{i} = s[i] + @nexprs $N i -> a_{i} = Vec{2, T}((s[i] * w[i], w[i])) @nexprs $len k -> (@nexprs ($len-k) i -> a_{i} = fmadd(a_{i}, levin_scale(one(T), i, k-1), a_{i+1})) return (a_1[1] / a_1[2]) end ) end -# implementation for complex numbers -@inline @generated function levin_transform(s::NTuple{N, Vec{4, T}}) where {N, T <: FloatTypes} +@inline @generated function levin_transform(s::NTuple{N, Complex{T}}, w::NTuple{N, Complex{T}}) where {N, T <: FloatTypes} len = N - 1 :( begin - @nexprs $N i -> a_{i} = s[i] + @nexprs $N i -> a_{i} = Vec{4, T}((reim(s[i] * w[i])..., reim(w[i])...)) @nexprs $len k -> (@nexprs ($len-k) i -> a_{i} = fmadd(a_{i}, levin_scale(one(T), i, k-1), a_{i+1})) return (complex(a_1[1], a_1[2]) / complex(a_1[3], a_1[4])) end ) end -end \ No newline at end of file +end From ab502252e56452f866b43a3d0db54bf1b6e09449 Mon Sep 17 00:00:00 2001 From: Michael Helton Date: Thu, 27 Apr 2023 18:24:47 -0400 Subject: [PATCH 19/19] fix promotion --- src/AiryFunctions/cairy.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/AiryFunctions/cairy.jl b/src/AiryFunctions/cairy.jl index eaae0ac..ed81703 100644 --- a/src/AiryFunctions/cairy.jl +++ b/src/AiryFunctions/cairy.jl @@ -209,7 +209,7 @@ function _airybi(z::Complex{T}) where T <: Union{Float32, Float64} elseif angle(z) < 7pi/8 || x > -4.5 r = airybi_power_series(z)[1] else - r = cispi(-1/6) * _airyai(z * cispi(-2/3)) + cispi(1/6) * _airyai(z*cispi(2/3)) + r = cispi(-T(1/6)) * _airyai(z * cispi(-T(2/3))) + cispi(T(1/6)) * _airyai(z*cispi(T(2/3))) end return check_conj ? conj(r) : r @@ -254,7 +254,7 @@ function _airybiprime(z::Complex{T}) where T <: Union{Float32, Float64} elseif angle(z) < 7pi/8 || x > -4.5 r = airybi_power_series(z)[2] else - r = cispi(-5/6) * airyaiprime(z * cispi(-2/3)) + cispi(5/6) * airyaiprime(z*cispi(2/3)) + r = cispi(-T(5/6)) * airyaiprime(z * cispi(-T(2/3))) + cispi(T(5/6)) * airyaiprime(z*cispi(T(2/3))) end return check_conj ? conj(r) : r