diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
index 777f7ca..9463bdc 100644
--- a/.github/workflows/CI.yml
+++ b/.github/workflows/CI.yml
@@ -21,7 +21,7 @@ jobs:
       matrix:
         version:
           - '1'
-          - '1.6'
+          - '1.8'
           - 'nightly'
         os:
           - ubuntu-latest
diff --git a/.github/workflows/documentation.yml b/.github/workflows/documentation.yml
index 4f576c9..d6767e3 100644
--- a/.github/workflows/documentation.yml
+++ b/.github/workflows/documentation.yml
@@ -16,7 +16,7 @@ jobs:
       - uses: actions/checkout@v3
       - uses: julia-actions/setup-julia@v1
         with:
-          version: '1.6'
+          version: '1.8'
       - name: Install dependencies
         run: julia --project=docs/ -e 'using Pkg; Pkg.develop(PackageSpec(path=pwd())); Pkg.instantiate()'
       - name: Build and deploy
diff --git a/Project.toml b/Project.toml
index cda15b3..f9605f0 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,9 +1,13 @@
 name = "Bessels"
 uuid = "0e736298-9ec6-45e8-9647-e4fc86a2fe38"
-version = "0.3-DEV"
+version = "0.3.0-DEV"
+
+[deps]
+SIMDMath = "5443be0b-e40a-4f70-a07e-dcd652efc383"
 
 [compat]
-julia = "1.6"
+julia = "1.8"
+SIMDMath = "0.2.1"
 
 [extras]
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
diff --git a/src/Airy/cairy.jl b/src/Airy/cairy.jl
index 6dff2af..98d402b 100644
--- a/src/Airy/cairy.jl
+++ b/src/Airy/cairy.jl
@@ -59,7 +59,7 @@ function _airyai(z::Complex{T}) where T <: Union{Float32, Float64}
         end
     end
     x, y = real(z), imag(z)
-    airy_large_argument_cutoff(z) && return airyai_large_argument(z)
+    airy_large_argument_cutoff(z) && return airyai_large_args(z)[1]
     airyai_power_series_cutoff(x, y) && return airyai_power_series(z)
 
     if x > zero(T)
@@ -116,7 +116,7 @@ function _airyaiprime(z::ComplexOrReal{T}) where T <: Union{Float32, Float64}
         end
     end
     x, y = real(z), imag(z)
-    airy_large_argument_cutoff(z) && return airyaiprime_large_argument(z)
+    airy_large_argument_cutoff(z) && return airyai_large_args(z)[2]
     airyai_power_series_cutoff(x, y) && return airyaiprime_power_series(z)
 
     if x > zero(T)
@@ -173,7 +173,7 @@ function _airybi(z::ComplexOrReal{T}) where T <: Union{Float32, Float64}
         end
     end
     x, y = real(z), imag(z)
-    airy_large_argument_cutoff(z) && return airybi_large_argument(z)
+    airy_large_argument_cutoff(z) && return airybi_large_args(z)[1]
     airybi_power_series_cutoff(x, y) && return airybi_power_series(z)
 
     if x > zero(T)
@@ -232,7 +232,7 @@ function _airybiprime(z::ComplexOrReal{T}) where T <: Union{Float32, Float64}
         end
     end
     x, y = real(z), imag(z)
-    airy_large_argument_cutoff(z) && return airybiprime_large_argument(z)
+    airy_large_argument_cutoff(z) && return airybi_large_args(z)[2]
     airybi_power_series_cutoff(x, y) && return airybiprime_power_series(z)
 
     if x > zero(T)
@@ -377,197 +377,106 @@ end
 airybiprime_power_series(x::Float32) = Float32(airybiprime_power_series(Float64(x), tol=eps(Float32)))
 airybiprime_power_series(x::ComplexF32) = ComplexF32(airybiprime_power_series(ComplexF64(x), tol=eps(Float32)))
 
+# calculates besselk from the power series of besseli using the continued fraction and wronskian
+# this shift the order higher first to avoid cancellation in the power series of besseli along the imaginary axis
+# for real arguments this is not needed because besseli can be computed stably over the entire real axis
+function besselk_continued_fraction_shift(nu, x)
+    shift = 20
+    inu, inum1 = besseli_power_series_inu_inum1(nu + shift, x)
+    inu, inum1 = besselk_down_recurrence(x, inum1, inu, nu + shift - 1, nu)
+    H_knu = besselk_ratio_knu_knup1(nu-1, x)
+    return 1 / (x * (inum1 + inu / H_knu))
+end
+
 #####
 #####  Large argument expansion for airy functions
 #####
-airy_large_argument_cutoff(z::ComplexOrReal{Float64}) = abs(z) > 8
+airy_large_argument_cutoff(z::ComplexOrReal{Float64}) = abs(z) > 8.3
 airy_large_argument_cutoff(z::ComplexOrReal{Float32}) = abs(z) > 4
 
-function airyai_large_argument(x::Real)
-    x < zero(x) && return real(airyai_large_argument(complex(x)))
-    return airy_large_arg_a(abs(x))
-end
-
-function airyai_large_argument(z::Complex{T}) where T
-    x, y = real(z), imag(z)
-    a = airy_large_arg_a(z)
-    if x < zero(T) && abs(y) < 5.5
-        b = airy_large_arg_b(z)
-        y >= zero(T) ? (return a + im*b) : (return a - im*b)
-    end
-    return a
-end
-
-function airyaiprime_large_argument(x::Real)
-    x < zero(x) && return real(airyaiprime_large_argument(complex(x)))
-    return airy_large_arg_c(abs(x))
-end
-
-function airyaiprime_large_argument(z::Complex{T}) where T
-    x, y = real(z), imag(z)
-    c = airy_large_arg_c(z)
-    if x < zero(T) && abs(y) < 5.5
-        d = airy_large_arg_d(z)
-        y >= zero(T) ? (return c + im*d) : (return c - im*d)
-    end
-    return c
-end
-
-function airybi_large_argument(x::Real)
-    if x < zero(x)
-        return 2*real(airy_large_arg_b(complex(x)))
+# valid in 0 <= angle(z) <= pi
+# use conjugation for bottom half plane
+function airyai_large_args(z::Complex{T}) where T
+    if imag(z) < zero(T)
+        out = conj.(airyaix_large_args(conj(z)))
     else
-        return 2*(airy_large_arg_b(x))
+        out = airyaix_large_args(z)
     end
+    return out .* exp(-2/3 * z * sqrt(z))
 end
-
-function airybi_large_argument(z::Complex{T}) where T
-    x, y = real(z), imag(z)
-    b = airy_large_arg_b(z)
-    abs(y) <= 1.7*(x - 6) && return 2*b
-
-    check_conj = false
-    if y < zero(T)
-        z = conj(z)
-        b = conj(b)
-        y = abs(y)
-        check_conj = true
-    end
-
-    a = airy_large_arg_a(z)
-    if x < zero(T) && y < 5
-        out = b + im*a
-        check_conj && (out = conj(out))
-        return out
+function airybi_large_args(z::Complex{T}) where T
+    if imag(z) < zero(T)
+        out = conj.(airybix_large_args(conj(z)))
     else
-        out = 2*b + im*a
-        check_conj && (out = conj(out))
-        return out
+        out = airybix_large_args(z)
     end
+    return out .* exp(2/3 * z * sqrt(z))
 end
 
-function airybiprime_large_argument(x::Real)
-    if x < zero(x)
-        return 2*real(airy_large_arg_d(complex(x)))
+@inline function airyaix_large_args(z::Complex{T}) where T
+    xsqr = sqrt(z)
+    xsqrx =  Base.FastMath.inv_fast(z * xsqr)
+    A, B, C, D = compute_airy_asy_coef(z, xsqrx)
+    
+    if (real(z) < 0.0) && abs(imag(z)) < sqrt(3)*abs(real(z))
+        e = exp(4/3 * z * xsqr)
+        ai = muladd(B*im, e, A)
+        aip = muladd(-D*im, e, C)
     else
-        return 2*(airy_large_arg_d(x))
-    end
-end
-
-function airybiprime_large_argument(z::Complex{T}) where T
-    x, y = real(z), imag(z)
-    d = airy_large_arg_d(z)
-    abs(y) <= 1.7*(x - 6) && return 2*d
-
-    check_conj = false
-    if y < zero(T)
-        z = conj(z)
-        d = conj(d)
-        y = abs(y)
-        check_conj = true
+        ai = A
+        aip = C
     end
 
-    c = airy_large_arg_c(z)
-    if x < zero(T) && y < 5
-        out = d + im*c
-        check_conj && (out = conj(out))
-        return out
-    else
-        out = 2*d + im*c
-        check_conj && (out = conj(out))
-        return out
-    end
+    xsqr = sqrt(xsqr)
+    return ai * Base.FastMath.inv_fast(xsqr) * inv(PIPOW3O2(T)), aip * xsqr * inv(PIPOW3O2(T))
 end
 
-# see equations 24 and relations using eq 25 and 26 in [1]
-function airy_large_arg_a(x::ComplexOrReal{T}; tol=eps(T)*40) where T
-    S = eltype(x)
-    MaxIter = 3000
-    xsqr = sqrt(x)
-
-    out = zero(S)
-    t = GAMMA_ONE_SIXTH(T) * GAMMA_FIVE_SIXTHS(T) / 4
-    a = 4*xsqr*x
-    for i in 0:MaxIter
-        out += t
-        abs(t) < tol*abs(out) && break
-        t *= -3*(i + one(T)/6) * (i + T(5)/6) / (a*(i + one(T)))
+@inline function airybix_large_args(z::Complex{T}) where T
+    xsqr = sqrt(z)
+    xsqrx = Base.FastMath.inv_fast(z * xsqr)
+    A, B, C, D = compute_airy_asy_coef(z, xsqrx)
+    
+    if (real(z) > 0.0) || abs(imag(z)) > sqrt(3)*abs(real(z))
+        B *= 2
+        D *= 2
     end
-    return out * exp(-a / 6) / (sqrt(T(π)^3) * sqrt(xsqr))
-end
 
-function airy_large_arg_b(x::ComplexOrReal{T}; tol=eps(T)*40) where T
-    S = eltype(x)
-    MaxIter = 3000
-    xsqr = sqrt(x)
+    e = exp(-4/3 * z * xsqr)
+    xsqr = sqrt(xsqr)
 
-    out = zero(S)
-    t = GAMMA_ONE_SIXTH(T) * GAMMA_FIVE_SIXTHS(T) / 4
-    a = 4*xsqr*x
-    for i in 0:MaxIter
-        out += t
-        abs(t) < tol*abs(out) && break
-        t *= 3*(i + one(T)/6) * (i + T(5)/6) / (a*(i + one(T)))
-    end
-    return out * exp(a / 6) / (sqrt(T(π)^3) * sqrt(xsqr))
+    bi = muladd(A*im, e, B) * Base.FastMath.inv_fast(xsqr) * inv(PIPOW3O2(T))
+    bip = muladd(C*im, e, -D) * xsqr * inv(PIPOW3O2(T))
+    return bi, bip
 end
 
-function airy_large_arg_c(x::ComplexOrReal{T}; tol=eps(T)*40) where T
-    S = eltype(x)
-    MaxIter = 3000
-    xsqr = sqrt(x)
+@inline function compute_airy_asy_coef(z, xsqrx)
+    invx3 = @fastmath inv(z^3)
+    p = SIMDMath.horner_simd(invx3, pack_AIRY_ASYM_COEF)
 
-    out = zero(S)
-    # use identities of gamma to relate to defined constants
-    # t = gamma(-one(T) / 6) * gamma(T(7) / 6) / 4
-    t = -GAMMA_FIVE_SIXTHS(T) * GAMMA_ONE_SIXTH(T) / 4
-    a = 4*xsqr*x
-    for i in 0:MaxIter
-        out += t
-        abs(t) < tol*abs(out) && break
-        t *= -3*(i - one(T)/6) * (i + T(7)/6) / (a*(i + one(T)))
-    end
-    return out * exp(-a / 6) * sqrt(xsqr) / sqrt(T(π)^3)
-end
+    pvec1 = SIMDMath.ComplexVec{2, Float64}((p.re[1], p.re[3]), (p.im[1], p.im[3]))
+    pvec2 = SIMDMath.ComplexVec{2, Float64}((p.re[2], p.re[4]), (p.im[2], p.im[4]))
 
-function airy_large_arg_d(x::ComplexOrReal{T}; tol=eps(T)*40) where T
-    S = eltype(x)
-    MaxIter = 3000
-    xsqr = sqrt(x)
+    zvec = SIMDMath.ComplexVec{2, Float64}((xsqrx.re, xsqrx.re), (xsqrx.im, xsqrx.im))
+    zvec = SIMDMath.fmul(zvec, pvec2)
+    a = SIMDMath.fadd(pvec1, zvec)
+    b = SIMDMath.fsub(pvec1, zvec)
 
-    out = zero(S)
-    # use identities of gamma to relate to defined constants
-    # t = gamma(-one(T) / 6) * gamma(T(7) / 6) / 4
-    t = -GAMMA_FIVE_SIXTHS(T) * GAMMA_ONE_SIXTH(T) / 4
-    a = 4*xsqr*x
-    for i in 0:MaxIter
-        out += t
-        abs(t) < tol*abs(out) && break
-        t *= 3*(i - one(T)/6) * (i + T(7)/6) / (a*(i + one(T)))
-    end
-    return -out * exp(a / 6) * sqrt(xsqr) / sqrt(T(π)^3)
+    A = complex(b.re[1].value, b.im[1].value)
+    B = complex(a.re[1].value, a.im[1].value)
+    C = complex(b.re[2].value, b.im[2].value)
+    D = complex(a.re[2].value, a.im[2].value)
+    return A, B, C, D
 end
 
-# negative arguments of airy functions oscillate around zero so as x -> -Inf it is more prone to cancellation
-# to give best accuracy it is best to promote to Float64 numbers until the Float32 tolerance
-airy_large_arg_a(x::Float32) = (airy_large_arg_a(Float64(x), tol=eps(Float32)))
-airy_large_arg_a(x::ComplexF32) = (airy_large_arg_a(ComplexF64(x), tol=eps(Float32)))
-
-airy_large_arg_b(x::Float32) = Float32(airy_large_arg_b(Float64(x), tol=eps(Float32)))
-airy_large_arg_b(x::ComplexF32) = ComplexF32(airy_large_arg_b(ComplexF64(x), tol=eps(Float32)))
+# to generate asymptotic expansions then split into even and odd coefficients
+# tuple(Float64.([3^k * gamma(k + 1//6) * gamma(k + 5//6) / (2^(2k+2) * gamma(k+1)) for k in big"0":big"24"])...)
+# tuple(Float64.([(3)^k * gamma(k - 1//6) * gamma(k + 7//6) / (2^(2k+2) * gamma(k+1)) for k in big"0":big"24"])...)
 
-airy_large_arg_c(x::Float32) = Float32(airy_large_arg_c(Float64(x), tol=eps(Float32)))
-airy_large_arg_c(x::ComplexF32) = ComplexF32(airy_large_arg_c(ComplexF64(x), tol=eps(Float32)))
+const AIRY_ASYM_COEF = (
+    (1.5707963267948966, 0.13124057851910487, 0.4584353787485384, 5.217255928936184, 123.97197893818594, 5038.313653002081, 312467.7049060495, 2.746439545069411e7, 3.2482560591146026e9, 4.97462635569055e11, 9.57732308323407e13, 2.2640712393216476e16, 6.447503420809101e18),
+    (0.1636246173744684, 0.20141783231057064, 1.3848568733028765, 23.555289417250567, 745.2667344964557, 37835.063701047824, 2.8147130917899106e6, 2.8856687720069575e8, 3.8998976239149216e10, 6.718472897263214e12, 1.4370735281142392e15, 3.7367429394637446e17, 1.1608192617215053e20),
+    (-1.5707963267948966, 0.15510250188621483, 0.4982993247266722, 5.515384839161109, 129.24738229725767, 5209.103946324185, 321269.61208650155, 2.812618811215662e7, 3.3166403972012258e9, 5.0676100258903735e11, 9.738286496397669e13, 2.298637212441062e16, 6.537678293827411e18),
+    (0.22907446432425577, 0.22511404787652015, 1.4803642438754887, 24.70432792540913, 773.390007496322, 38999.21950723391, 2.8878225227454924e6, 2.950515261265541e8, 3.97712331943799e10, 6.837383921993536e12, 1.460066704564067e15, 3.7912939312807334e17, 1.176400728321794e20)
+)
 
-airy_large_arg_d(x::Float32) = Float32(airy_large_arg_d(Float64(x), tol=eps(Float32)))
-airy_large_arg_d(x::ComplexF32) = ComplexF32(airy_large_arg_d(ComplexF64(x), tol=eps(Float32)))
-# calculates besselk from the power series of besseli using the continued fraction and wronskian
-# this shift the order higher first to avoid cancellation in the power series of besseli along the imaginary axis
-# for real arguments this is not needed because besseli can be computed stably over the entire real axis
-function besselk_continued_fraction_shift(nu, x)
-    shift = 20
-    inu, inum1 = besseli_power_series_inu_inum1(nu + shift, x)
-    inu, inum1 = besselk_down_recurrence(x, inum1, inu, nu + shift - 1, nu)
-    H_knu = besselk_ratio_knu_knup1(nu-1, x)
-    return 1 / (x * (inum1 + inu / H_knu))
-end
+const pack_AIRY_ASYM_COEF = SIMDMath.pack_poly(AIRY_ASYM_COEF)
diff --git a/src/Bessels.jl b/src/Bessels.jl
index 09c7fd2..110cc8e 100644
--- a/src/Bessels.jl
+++ b/src/Bessels.jl
@@ -1,5 +1,7 @@
 module Bessels
 
+using SIMDMath
+
 export besselj0
 export besselj1
 export besselj