From 07ae6ce8eb5832d2e906784465a5aa89aeb464b0 Mon Sep 17 00:00:00 2001 From: Michael Helton Date: Sat, 4 Mar 2023 14:06:18 -0500 Subject: [PATCH 01/14] add simd evalpoly method --- src/simd.jl | 103 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 103 insertions(+) create mode 100644 src/simd.jl diff --git a/src/simd.jl b/src/simd.jl new file mode 100644 index 0000000..37cda7b --- /dev/null +++ b/src/simd.jl @@ -0,0 +1,103 @@ + +module HornerSIMD +# inspiration from SIMD.jl and VectorizationBase.jl + +using Base: llvmcall, VecElement +import Base: muladd + +export evalpoly_packed + +const VE = Base.VecElement +const FloatTypes = Union{Float32, Float64} +const LVec{N, FloatTypes} = NTuple{N, VE{FloatTypes}} +const ScalarTypes = Union{VE{FloatTypes}, FloatTypes} +const SIMDLanes = Union{LVec{2, Float64}, LVec{4, Float64}, LVec{8, Float64}} + + +# create tuples of VecElements filled with a constant value of x + +@inline constantvector(x::Float64, y) = constantvector(VE(x), y) + +@inline function constantvector(x::VecElement{Float64}, y::Type{LVec{2, Float64}}) + llvmcall("""%2 = insertelement <2 x double> undef, double %0, i32 0 + %3 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> zeroinitializer + ret <2 x double> %3""", + LVec{2, Float64}, + Tuple{VecElement{Float64}}, + x) +end +@inline function constantvector(x::VecElement{Float64}, y::Type{LVec{4, Float64}}) + llvmcall("""%2 = insertelement <4 x double> undef, double %0, i32 0 + %3 = shufflevector <4 x double> %2, <4 x double> undef, <4 x i32> zeroinitializer + ret <4 x double> %3""", + LVec{4, Float64}, + Tuple{VecElement{Float64}}, + x) +end +@inline function constantvector(x::VecElement{Float64}, y::Type{LVec{8, Float64}}) + llvmcall("""%2 = insertelement <8 x double> undef, double %0, i32 0 + %3 = shufflevector <8 x double> %2, <8 x double> undef, <8 x i32> zeroinitializer + ret <8 x double> %3""", + LVec{8, Float64}, + Tuple{VecElement{Float64}}, + x) +end + +# promotions for scalar variables to LVec +#### need to think about the type being just a Float or a single VecElement will need to widen the Float Types to include a single VecElement + +@inline muladd(x::LVec{N, T}, y::LVec{N, T}, z::LVec{N, T}) where {N, T <: FloatTypes} = _muladd(x, y, z) +@inline muladd(x::ScalarTypes, y::LVec{N, T}, z::LVec{N, T}) where {N, T <: FloatTypes} = muladd(constantvector(x, LVec{N, T}), y, z) +@inline muladd(x::LVec{N, T}, y::ScalarTypes, z::LVec{N, T}) where {N, T <: FloatTypes} = muladd(x, constantvector(y, LVec{N, T}), z) +@inline muladd(x::ScalarTypes, y::ScalarTypes, z::LVec{N, T}) where {N, T <: FloatTypes} = muladd(constantvector(x, LVec{N, T}), constantvector(y, LVec{N, T}), z) +@inline muladd(x::LVec{N, T}, y::LVec{N, T}, z::ScalarTypes) where {N, T <: FloatTypes} = muladd(x, y, constantvector(z, LVec{N, T})) +@inline muladd(x::ScalarTypes, y::LVec{N, T}, z::ScalarTypes) where {N, T <: FloatTypes} = muladd(constantvector(x, LVec{N, T}), y, constantvector(z, LVec{N, T})) +@inline muladd(x::LVec{N, T}, y::ScalarTypes, z::ScalarTypes) where {N, T <: FloatTypes} = muladd(x, constantvector(y, LVec{N, T}), constantvector(z, LVec{N, T})) + +# muladd llvm instructions + +@inline function _muladd(x::LVec{2, Float64}, y::LVec{2, Float64}, z::LVec{2, Float64}) + llvmcall("""%4 = fmul contract <2 x double> %0, %1 + %5 = fadd contract <2 x double> %4, %2 + ret <2 x double> %5""", + LVec{2, Float64}, + Tuple{LVec{2, Float64}, LVec{2, Float64}, LVec{2, Float64}}, + x, + y, + z) +end + +@inline function 
_muladd(x::LVec{4, Float64}, y::LVec{4, Float64}, z::LVec{4, Float64}) + llvmcall("""%4 = fmul contract <4 x double> %0, %1 + %5 = fadd contract <4 x double> %4, %2 + ret <4 x double> %5""", + LVec{4, Float64}, + Tuple{LVec{4, Float64}, LVec{4, Float64}, LVec{4, Float64}}, + x, + y, + z) +end + +@inline function _muladd(x::LVec{8, Float64}, y::LVec{8, Float64}, z::LVec{8, Float64}) + llvmcall("""%4 = fmul contract <8 x double> %0, %1 + %5 = fadd contract <8 x double> %4, %2 + ret <8 x double> %5""", + LVec{8, Float64}, + Tuple{LVec{8, Float64}, LVec{8, Float64}, LVec{8, Float64}}, + x, + y, + z) +end + +evalpoly_packed(x::Union{T, VE{T}}, p::NTuple{N, LVec{M, T}}) where {N, M, T} = evalpoly_packed(constantvector(x, LVec{M, T}), p) + +# the type needs to only accept a custom type for LVec{2, Float64}, LVec{4, Float64}.... +@inline function evalpoly_packed(x::LVec{M, Float64}, p::NTuple{N, LVec{M, Float64}}) where {N, M} + a = p[end] + @inbounds for i in N-1:-1:1 + a = muladd(x, a, p[i]) + end + return a +end + +end From 66f8f5f80511b5ae303eac1ed5f03d439f1652cd Mon Sep 17 00:00:00 2001 From: Michael Helton Date: Tue, 7 Mar 2023 19:04:04 -0500 Subject: [PATCH 02/14] update structure and add 2, 4, 8th order horner --- src/SIMDMath/SIMDMath.jl | 30 ++++++++++ src/SIMDMath/arithmetic.jl | 44 +++++++++++++++ src/SIMDMath/horner.jl | 85 ++++++++++++++++++++++++++++ src/SIMDMath/shufflevector.jl | 28 +++++++++ src/simd.jl | 103 ---------------------------------- 5 files changed, 187 insertions(+), 103 deletions(-) create mode 100644 src/SIMDMath/SIMDMath.jl create mode 100644 src/SIMDMath/arithmetic.jl create mode 100644 src/SIMDMath/horner.jl create mode 100644 src/SIMDMath/shufflevector.jl delete mode 100644 src/simd.jl diff --git a/src/SIMDMath/SIMDMath.jl b/src/SIMDMath/SIMDMath.jl new file mode 100644 index 0000000..30a3354 --- /dev/null +++ b/src/SIMDMath/SIMDMath.jl @@ -0,0 +1,30 @@ +# +# SIMDMath aims to provide vectorized basic math operations to be used in static fully unrolled functions such as computing special functions. +# +# The type system is heavily influenced by SIMD.jl (https://github.com/eschnett/SIMD.jl) licensed under the Simplified "2-clause" BSD License: +# Copyright (c) 2016-2020: Erik Schnetter, Kristoffer Carlsson, Julia Computing All rights reserved. 
+#
+# This module is also influenced by VectorizationBase.jl (https://github.com/JuliaSIMD/VectorizationBase.jl) licensed under the MIT License: Copyright (c) 2018 Chris Elrod
+#
+
+module SIMDMath
+
+using Base: llvmcall, VecElement
+
+export evalpoly_packed
+
+const VE = Base.VecElement
+const FloatTypes = Union{Float32, Float64}
+const LVec{N, FloatTypes} = NTuple{N, VE{FloatTypes}}
+
+## maybe have a complex type stored as a pack or real and complex values separately
+const CVec{N, FloatTypes} = NTuple{2, LVec{N, FloatTypes}}
+# CVec{2, Float64}((LVec{2, Float64}((1.1, 1.2)), LVec{2, Float64}((1.5, 1.0))))
+
+const ScalarTypes = Union{VE{FloatTypes}, FloatTypes}
+const SIMDLanes = Union{LVec{2, Float64}, LVec{4, Float64}, LVec{8, Float64}}
+
+end
+
diff --git a/src/SIMDMath/arithmetic.jl b/src/SIMDMath/arithmetic.jl
new file mode 100644
index 0000000..c844435
--- /dev/null
+++ b/src/SIMDMath/arithmetic.jl
@@ -0,0 +1,44 @@
+import Base: muladd
+
+@inline muladd(x::LVec{N, T}, y::LVec{N, T}, z::LVec{N, T}) where {N, T <: FloatTypes} = _muladd(x, y, z)
+@inline muladd(x::ScalarTypes, y::LVec{N, T}, z::LVec{N, T}) where {N, T <: FloatTypes} = muladd(constantvector(x, LVec{N, T}), y, z)
+@inline muladd(x::LVec{N, T}, y::ScalarTypes, z::LVec{N, T}) where {N, T <: FloatTypes} = muladd(x, constantvector(y, LVec{N, T}), z)
+@inline muladd(x::ScalarTypes, y::ScalarTypes, z::LVec{N, T}) where {N, T <: FloatTypes} = muladd(constantvector(x, LVec{N, T}), constantvector(y, LVec{N, T}), z)
+@inline muladd(x::LVec{N, T}, y::LVec{N, T}, z::ScalarTypes) where {N, T <: FloatTypes} = muladd(x, y, constantvector(z, LVec{N, T}))
+@inline muladd(x::ScalarTypes, y::LVec{N, T}, z::ScalarTypes) where {N, T <: FloatTypes} = muladd(constantvector(x, LVec{N, T}), y, constantvector(z, LVec{N, T}))
+@inline muladd(x::LVec{N, T}, y::ScalarTypes, z::ScalarTypes) where {N, T <: FloatTypes} = muladd(x, constantvector(y, LVec{N, T}), constantvector(z, LVec{N, T}))
+
+# muladd llvm instructions
+
+@inline function _muladd(x::LVec{2, Float64}, y::LVec{2, Float64}, z::LVec{2, Float64})
+    llvmcall("""%4 = fmul contract <2 x double> %0, %1
+                %5 = fadd contract <2 x double> %4, %2
+                ret <2 x double> %5""",
+    LVec{2, Float64},
+    Tuple{LVec{2, Float64}, LVec{2, Float64}, LVec{2, Float64}},
+    x,
+    y,
+    z)
+end
+
+@inline function _muladd(x::LVec{4, Float64}, y::LVec{4, Float64}, z::LVec{4, Float64})
+    llvmcall("""%4 = fmul contract <4 x double> %0, %1
+                %5 = fadd contract <4 x double> %4, %2
+                ret <4 x double> %5""",
+    LVec{4, Float64},
+    Tuple{LVec{4, Float64}, LVec{4, Float64}, LVec{4, Float64}},
+    x,
+    y,
+    z)
+end
+
+@inline function _muladd(x::LVec{8, Float64}, y::LVec{8, Float64}, z::LVec{8, Float64})
+    llvmcall("""%4 = fmul contract <8 x double> %0, %1
+                %5 = fadd contract <8 x double> %4, %2
+                ret <8 x double> %5""",
+    LVec{8, Float64},
+    Tuple{LVec{8, Float64}, LVec{8, Float64}, LVec{8, Float64}},
+    x,
+    y,
+    z)
+end
\ No newline at end of file
diff --git a/src/SIMDMath/horner.jl b/src/SIMDMath/horner.jl
new file mode 100644
index 0000000..dcd34d7
--- /dev/null
+++ b/src/SIMDMath/horner.jl
@@ -0,0 +1,85 @@
+
+# promotions for scalar variables to LVec
+#### need to think about the type being just a Float or a single VecElement will need to widen the Float Types to include a single VecElement
+
+evalpoly_packed(x::Union{T, VE{T}}, p::NTuple{N, LVec{M, T}}) where {N, M, T} = evalpoly_packed(constantvector(x, LVec{M, T}), p)
+
+# the type needs to only accept a custom type for LVec{2, Float64}, LVec{4, Float64}....
+@inline function evalpoly_packed(x::LVec{M, Float64}, p::NTuple{N, LVec{M, Float64}}) where {N, M} + a = p[end] + @inbounds for i in N-1:-1:1 + a = muladd(x, a, p[i]) + end + return a +end + +@inline pack_poly(p1::NTuple{N, T}, p2::NTuple{N, T}, p3::NTuple{N, T}, p4::NTuple{N, T}) where {N, T <: Float64} = ntuple(i -> LVec{4, T}((p1[i], p2[i], p3[i], p4[i])), Val(N)) + + + +# performs a second order horner scheme that splits polynomial evaluation into even/odd powers +@inline horner2(x, P::NTuple{N, Float64}) where N = horner2(x, pack_poly2(P)) + +@inline function horner2(x, P::NTuple{N, LVec{2, Float64}}) where N + a = evalpoly_packed(x * x, P) + return muladd(x, a[2].value, a[1].value) +end + +# do we try to vectorize the final line as the two muladds can be done in parallel +# they are in reasonable order it would be about splitting the a1, a2, a3, a4 into a two by two +# we can then do the same for the 8th order scheme +## 4th order horner scheme that splits into a + ex^4 + ... & b + fx^5 ... & etc +@inline horner4(x, P::NTuple{N, Float64}) where N = horner4(x, pack_poly4(P)) + +@inline function horner4(x, P::NTuple{N, LVec{4, Float64}}) where N + xx = x * x + a = evalpoly_packed(xx * xx, P) + b = muladd(x, LVec{2, Float64}((a[4].value, a[2].value)), LVec{2, Float64}((a[3].value, a[1].value))) + return muladd(xx, b[1].value, b[2].value) +end + +# 8th order horner scheme that splits into a + ix^8 + .... & etc + +@inline horner8(x, P::NTuple{N, Float64}) where N = horner8(x, pack_poly8(P)) + +@inline function horner8(x, P::NTuple{N, LVec{8, Float64}}) where N + x2 = x * x + x4 = x2 * x2 + a = evalpoly_packed(x4 * x4, P) + + # following computes + # a[1].value + a[2].value*x + a[3].value*x^2 + a[4].value*x^3 + a[5].value*x^4 + a[6].value*x^5 + a[7].value*x^6 + a[8].value*x^7 + + b = muladd(x, LVec{4, Float64}((a[4].value, a[2].value, a[8].value, a[6].value)), LVec{4, Float64}((a[3].value, a[1].value, a[7].value, a[5].value))) + c = muladd(x2, LVec{2, Float64}((b[1].value, b[3].value)), LVec{2, Float64}((b[2].value, b[4].value))) + return muladd(x4, c[2].value, c[1].value) +end + +@inline function pack_poly2(p::NTuple{N, T}) where {N, T <: Float64} + isone(N) && return LVec{2, T}((p[1], 0.0)) + if iseven(N) + return ntuple(i -> LVec{2, T}((p[2*i - 1], p[2i])), Val(N ÷ 2)) + else + return ntuple(i -> (isequal(i, (N + 1) ÷ 2) ? (LVec{2, T}((p[2*i - 1], 0.0))) : LVec{2, T}((p[2*i - 1], p[2i]))), Val((N + 1) ÷ 2)) + end +end + +@inline function pack_poly4(p::NTuple{N, T}) where {N, T <: Float64} + rem = N % 4 + if !iszero(rem) + P = (p..., ntuple(i -> 0.0, Val(4 - rem))...) + return ntuple(i -> LVec{4, T}((P[4*i - 3], P[4i - 2], P[4i - 1], P[4i])), Val(length(P) ÷ 4)) + else + return ntuple(i -> LVec{4, T}((p[4*i - 3], p[4i - 2], p[4i - 1], p[4i])), Val(N ÷ 4)) + end +end + +@inline function pack_poly8(p::NTuple{N, T}) where {N, T <: Float64} + rem = N % 8 + if !iszero(rem) + P = (p..., ntuple(i -> 0.0, Val(8 - rem))...) 
+ return ntuple(i -> LVec{8, T}((P[8i - 7], P[8i - 6], P[8i - 5], P[8i - 4], P[8i - 3], P[8i - 2], P[8i - 1], P[8i])), Val(length(P) ÷ 8)) + else + return ntuple(i -> LVec{8, T}((p[8i - 7], p[8i - 6], p[8i - 5], p[8i - 4], p[8i - 3], p[8i - 2], p[8i - 1], p[8i])), Val(N ÷ 8)) + end +end diff --git a/src/SIMDMath/shufflevector.jl b/src/SIMDMath/shufflevector.jl new file mode 100644 index 0000000..3c070ba --- /dev/null +++ b/src/SIMDMath/shufflevector.jl @@ -0,0 +1,28 @@ +# create tuples of VecElements filled with a constant value of x + +@inline constantvector(x::Float64, y) = constantvector(VE(x), y) + +@inline function constantvector(x::VecElement{Float64}, y::Type{LVec{2, Float64}}) + llvmcall("""%2 = insertelement <2 x double> undef, double %0, i32 0 + %3 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> zeroinitializer + ret <2 x double> %3""", + LVec{2, Float64}, + Tuple{VecElement{Float64}}, + x) +end +@inline function constantvector(x::VecElement{Float64}, y::Type{LVec{4, Float64}}) + llvmcall("""%2 = insertelement <4 x double> undef, double %0, i32 0 + %3 = shufflevector <4 x double> %2, <4 x double> undef, <4 x i32> zeroinitializer + ret <4 x double> %3""", + LVec{4, Float64}, + Tuple{VecElement{Float64}}, + x) +end +@inline function constantvector(x::VecElement{Float64}, y::Type{LVec{8, Float64}}) + llvmcall("""%2 = insertelement <8 x double> undef, double %0, i32 0 + %3 = shufflevector <8 x double> %2, <8 x double> undef, <8 x i32> zeroinitializer + ret <8 x double> %3""", + LVec{8, Float64}, + Tuple{VecElement{Float64}}, + x) +end diff --git a/src/simd.jl b/src/simd.jl deleted file mode 100644 index 37cda7b..0000000 --- a/src/simd.jl +++ /dev/null @@ -1,103 +0,0 @@ - -module HornerSIMD -# inspiration from SIMD.jl and VectorizationBase.jl - -using Base: llvmcall, VecElement -import Base: muladd - -export evalpoly_packed - -const VE = Base.VecElement -const FloatTypes = Union{Float32, Float64} -const LVec{N, FloatTypes} = NTuple{N, VE{FloatTypes}} -const ScalarTypes = Union{VE{FloatTypes}, FloatTypes} -const SIMDLanes = Union{LVec{2, Float64}, LVec{4, Float64}, LVec{8, Float64}} - - -# create tuples of VecElements filled with a constant value of x - -@inline constantvector(x::Float64, y) = constantvector(VE(x), y) - -@inline function constantvector(x::VecElement{Float64}, y::Type{LVec{2, Float64}}) - llvmcall("""%2 = insertelement <2 x double> undef, double %0, i32 0 - %3 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> zeroinitializer - ret <2 x double> %3""", - LVec{2, Float64}, - Tuple{VecElement{Float64}}, - x) -end -@inline function constantvector(x::VecElement{Float64}, y::Type{LVec{4, Float64}}) - llvmcall("""%2 = insertelement <4 x double> undef, double %0, i32 0 - %3 = shufflevector <4 x double> %2, <4 x double> undef, <4 x i32> zeroinitializer - ret <4 x double> %3""", - LVec{4, Float64}, - Tuple{VecElement{Float64}}, - x) -end -@inline function constantvector(x::VecElement{Float64}, y::Type{LVec{8, Float64}}) - llvmcall("""%2 = insertelement <8 x double> undef, double %0, i32 0 - %3 = shufflevector <8 x double> %2, <8 x double> undef, <8 x i32> zeroinitializer - ret <8 x double> %3""", - LVec{8, Float64}, - Tuple{VecElement{Float64}}, - x) -end - -# promotions for scalar variables to LVec -#### need to think about the type being just a Float or a single VecElement will need to widen the Float Types to include a single VecElement - -@inline muladd(x::LVec{N, T}, y::LVec{N, T}, z::LVec{N, T}) where {N, T <: FloatTypes} = _muladd(x, y, z) 
-@inline muladd(x::ScalarTypes, y::LVec{N, T}, z::LVec{N, T}) where {N, T <: FloatTypes} = muladd(constantvector(x, LVec{N, T}), y, z) -@inline muladd(x::LVec{N, T}, y::ScalarTypes, z::LVec{N, T}) where {N, T <: FloatTypes} = muladd(x, constantvector(y, LVec{N, T}), z) -@inline muladd(x::ScalarTypes, y::ScalarTypes, z::LVec{N, T}) where {N, T <: FloatTypes} = muladd(constantvector(x, LVec{N, T}), constantvector(y, LVec{N, T}), z) -@inline muladd(x::LVec{N, T}, y::LVec{N, T}, z::ScalarTypes) where {N, T <: FloatTypes} = muladd(x, y, constantvector(z, LVec{N, T})) -@inline muladd(x::ScalarTypes, y::LVec{N, T}, z::ScalarTypes) where {N, T <: FloatTypes} = muladd(constantvector(x, LVec{N, T}), y, constantvector(z, LVec{N, T})) -@inline muladd(x::LVec{N, T}, y::ScalarTypes, z::ScalarTypes) where {N, T <: FloatTypes} = muladd(x, constantvector(y, LVec{N, T}), constantvector(z, LVec{N, T})) - -# muladd llvm instructions - -@inline function _muladd(x::LVec{2, Float64}, y::LVec{2, Float64}, z::LVec{2, Float64}) - llvmcall("""%4 = fmul contract <2 x double> %0, %1 - %5 = fadd contract <2 x double> %4, %2 - ret <2 x double> %5""", - LVec{2, Float64}, - Tuple{LVec{2, Float64}, LVec{2, Float64}, LVec{2, Float64}}, - x, - y, - z) -end - -@inline function _muladd(x::LVec{4, Float64}, y::LVec{4, Float64}, z::LVec{4, Float64}) - llvmcall("""%4 = fmul contract <4 x double> %0, %1 - %5 = fadd contract <4 x double> %4, %2 - ret <4 x double> %5""", - LVec{4, Float64}, - Tuple{LVec{4, Float64}, LVec{4, Float64}, LVec{4, Float64}}, - x, - y, - z) -end - -@inline function _muladd(x::LVec{8, Float64}, y::LVec{8, Float64}, z::LVec{8, Float64}) - llvmcall("""%4 = fmul contract <8 x double> %0, %1 - %5 = fadd contract <8 x double> %4, %2 - ret <8 x double> %5""", - LVec{8, Float64}, - Tuple{LVec{8, Float64}, LVec{8, Float64}, LVec{8, Float64}}, - x, - y, - z) -end - -evalpoly_packed(x::Union{T, VE{T}}, p::NTuple{N, LVec{M, T}}) where {N, M, T} = evalpoly_packed(constantvector(x, LVec{M, T}), p) - -# the type needs to only accept a custom type for LVec{2, Float64}, LVec{4, Float64}.... 
-@inline function evalpoly_packed(x::LVec{M, Float64}, p::NTuple{N, LVec{M, Float64}}) where {N, M} - a = p[end] - @inbounds for i in N-1:-1:1 - a = muladd(x, a, p[i]) - end - return a -end - -end From 4f2e9781d3d9a8aa518aeead0b2b908847d9fe80 Mon Sep 17 00:00:00 2001 From: Michael Helton Date: Wed, 8 Mar 2023 11:20:01 -0500 Subject: [PATCH 03/14] clean up naming --- src/Bessels.jl | 2 + src/SIMDMath/SIMDMath.jl | 13 ++-- src/SIMDMath/horner.jl | 137 +++++++++++++++++++++++++-------------- 3 files changed, 99 insertions(+), 53 deletions(-) diff --git a/src/Bessels.jl b/src/Bessels.jl index 63e7536..689878a 100644 --- a/src/Bessels.jl +++ b/src/Bessels.jl @@ -60,6 +60,8 @@ include("Polynomials/besselj_polys.jl") include("asymptotics.jl") include("gamma.jl") +include("SIMDMath/SIMDMath.jl") + precompile(besselj, (Float64, Float64)) end diff --git a/src/SIMDMath/SIMDMath.jl b/src/SIMDMath/SIMDMath.jl index 30a3354..9526dc0 100644 --- a/src/SIMDMath/SIMDMath.jl +++ b/src/SIMDMath/SIMDMath.jl @@ -9,22 +9,25 @@ module SIMDMath - using Base: llvmcall, VecElement - -export evalpoly_packed +export horner_simd, pack_horner +export horner, horner2, horner4, horner8 +export pack_horner2, pack_horner4, pack_horner8 const VE = Base.VecElement const FloatTypes = Union{Float32, Float64} const LVec{N, FloatTypes} = NTuple{N, VE{FloatTypes}} -## maybe have a complex type stored as a pack or real and complex values separatley +## maybe have a complex type stored as a pack or real and complex values separately const CVec{N, FloatTypes} = NTuple{2, LVec{N, FloatTypes}} # CVec{2, Float64}((LVec{2, Float64}((1.1, 1.2)), LVec{2, Float64}((1.5, 1.0)))) const ScalarTypes = Union{VE{FloatTypes}, FloatTypes} const SIMDLanes = Union{LVec{2, Float64}, LVec{4, Float64}, LVec{8, Float64}} -end +include("arithmetic.jl") +include("shufflevector.jl") +include("horner.jl") +end diff --git a/src/SIMDMath/horner.jl b/src/SIMDMath/horner.jl index dcd34d7..5039193 100644 --- a/src/SIMDMath/horner.jl +++ b/src/SIMDMath/horner.jl @@ -1,11 +1,30 @@ - -# promotions for scalar variables to LVec -#### need to think about the type being just a Float or a single VecElement will need to widen the Float Types to include a single VecElement - -evalpoly_packed(x::Union{T, VE{T}}, p::NTuple{N, LVec{M, T}}) where {N, M, T} = evalpoly_packed(constantvector(x, LVec{M, T}), p) - -# the type needs to only accept a custom type for LVec{2, Float64}, LVec{4, Float64}.... -@inline function evalpoly_packed(x::LVec{M, Float64}, p::NTuple{N, LVec{M, Float64}}) where {N, M} +# +# Polynomial evaluation using Horner's scheme. +# +# File contains methods to compute several different polynomials at once using SIMD with Horner's scheme (horner_simd). +# Additional methods to evaluate a single polynomial using SIMD with different order Horner's scheme (horner2, horner4, horner8). +# TODO: Extend to Float32 +# TODO: Support complex numbers + +# +# Parallel Horner's scheme to evaluate 2, 4, or 8 polynomials in parallel using SIMD. 
+#
+# Usage:
+# x = 1.1
+# P1 = (1.1, 1.2, 1.4, 1.5)
+# P2 = (1.3, 1.3, 1.6, 1.8)
+# a = horner_simd(x, pack_horner(P1, P2))
+# a[1].value == evalpoly(x, P1)
+# a[2].value == evalpoly(x, P2)
+#
+# Note the strict equality as this method doesn't alter the order of individual polynomial evaluations
+#
+
+# cast single element `x` to a width of the number of polynomials
+horner_simd(x::Union{T, VE{T}}, p::NTuple{N, LVec{M, T}}) where {N, M, T} = horner_simd(constantvector(x, LVec{M, T}), p)
+
+# In theory, this can accept any number of packed polynomials though for SIMD this really should only accept `M = 2, 4, 8` and for Float32 `M = 16`
+@inline function horner_simd(x::LVec{M, Float64}, p::NTuple{N, LVec{M, Float64}}) where {N, M}
     a = p[end]
     @inbounds for i in N-1:-1:1
         a = muladd(x, a, p[i])
     end
     return a
 end
 
+@inline pack_horner(p1::NTuple{N, T}, p2::NTuple{N, T}) where {N, T <: Float64} = ntuple(i -> LVec{2, T}((p1[i], p2[i])), Val(N))
+@inline pack_horner(p1::NTuple{N, T}, p2::NTuple{N, T}, p3::NTuple{N, T}, p4::NTuple{N, T}) where {N, T <: Float64} = ntuple(i -> LVec{4, T}((p1[i], p2[i], p3[i], p4[i])), Val(N))
+@inline pack_horner(p1::NTuple{N, T}, p2::NTuple{N, T}, p3::NTuple{N, T}, p4::NTuple{N, T}, p5::NTuple{N, T}, p6::NTuple{N, T}, p7::NTuple{N, T}, p8::NTuple{N, T}) where {N, T <: Float64} = ntuple(i -> LVec{8, T}((p1[i], p2[i], p3[i], p4[i], p5[i], p6[i], p7[i], p8[i])), Val(N))
+
+#
+# Horner scheme for evaluating single polynomials
+#
+# In some cases we are interested in speeding up the evaluation of a single polynomial of high degree.
+# It is possible to split the polynomial into pieces and use SIMD to compute each piece simultaneously.
+#
+# Usage:
+# x = 1.1
+# poly = (1.0, 0.5, 0.1, 0.05, 0.1)
+# evalpoly(x, poly) ≈ horner2(x, pack_horner2(poly)) ≈ horner4(x, pack_horner4(poly)) ≈ horner8(x, pack_horner8(poly))
+#
+# Note the approximate relation between each, as floating point arithmetic is not associative.
+# Here, we are accumulating the pieces of the polynomial in a different order, so there will be a few ULPs of difference
+#
+
+# horner - default regular horner to base evalpoly
+# horner2 - 2nd horner scheme that splits polynomial into even/odd powers
+# horner4 - 4th order horner scheme that splits into a + ex^4 + ... & b + fx^5 ... & etc
+# horner8 - 8th order horner scheme that splits into a + ix^8 + .... & etc
+
+@inline horner(x::T, P::NTuple{N, T}) where {N, T <: FloatTypes} = evalpoly(x, P)
+@inline horner2(x, P::NTuple{N, Float64}) where N = horner2(x, pack_horner2(P))
+@inline horner4(x, P::NTuple{N, Float64}) where N = horner4(x, pack_horner4(P))
+@inline horner8(x, P::NTuple{N, Float64}) where N = horner8(x, pack_horner8(P))
+
+@inline function horner2(x, P::NTuple{N, LVec{2, Float64}}) where N
+    a = horner_simd(x * x, P)
+    return muladd(x, a[2].value, a[1].value)
+end
+
+@inline function horner4(x, P::NTuple{N, LVec{4, Float64}}) where N
+    xx = x * x
+    a = horner_simd(xx * xx, P)
+    b = muladd(x, LVec{2, Float64}((a[4].value, a[2].value)), LVec{2, Float64}((a[3].value, a[1].value)))
+    return muladd(xx, b[1].value, b[2].value)
+end
+
+@inline function horner8(x, P::NTuple{N, LVec{8, Float64}}) where N
+    x2 = x * x
+    x4 = x2 * x2
+    a = horner_simd(x4 * x4, P)
+
+    # following computes
+    # a[1].value + a[2].value*x + a[3].value*x^2 + a[4].value*x^3 + a[5].value*x^4 + a[6].value*x^5 + a[7].value*x^6 + a[8].value*x^7
+
+    b = muladd(x, LVec{4, Float64}((a[4].value, a[2].value, a[8].value, a[6].value)), LVec{4, Float64}((a[3].value, a[1].value, a[7].value, a[5].value)))
+    c = muladd(x2, LVec{2, Float64}((b[1].value, b[3].value)), LVec{2, Float64}((b[2].value, b[4].value)))
+    return muladd(x4, c[2].value, c[1].value)
+end
+
+#
+# The following packs coefficients for second, fourth, and eighth order horner evaluations.
+# Each accepts a single polynomial and packs its coefficients appropriately.
+#
+# Usage:
+# x = 1.1
+# poly = (1.0, 0.5, 0.1, 0.05, 0.1)
+# evalpoly(x, poly) ≈ horner2(x, pack_horner2(poly)) ≈ horner4(x, pack_horner4(poly)) ≈ horner8(x, pack_horner8(poly))
+#
+# Note: these functions will pack statically known polynomials for N <= 32 at compile time.
+# If length(poly) > 32 they must be packed and declared as constants, otherwise packing will allocate at runtime.
+# Example:
+# const P = pack_horner2(ntuple(i -> i*0.1, 35))
+# horner2(x, P)
+#
+# TODO: Use a generated function instead might make the packing more reliable for all polynomial degrees
+#
+
+@inline function pack_horner2(p::NTuple{N, T}) where {N, T <: Float64}
+    rem = N % 2
+    pad = !iszero(rem) ? (2 - rem) : 0
+    P = (p..., ntuple(i -> 0.0, Val(pad))...)
+    return ntuple(i -> LVec{2, T}((P[2*i - 1], P[2i])), Val((N + pad) ÷ 2))
+end
+
+@inline function pack_horner4(p::NTuple{N, T}) where {N, T <: Float64}
+    rem = N % 4
+    pad = !iszero(rem) ? (4 - rem) : 0
+    P = (p..., ntuple(i -> 0.0, Val(pad))...)
+    return ntuple(i -> LVec{4, T}((P[4*i - 3], P[4i - 2], P[4i - 1], P[4i])), Val((N + pad) ÷ 4))
+end
+
+@inline function pack_horner8(p::NTuple{N, T}) where {N, T <: Float64}
+    rem = N % 8
+    pad = !iszero(rem) ? (8 - rem) : 0
+    P = (p..., ntuple(i -> 0.0, Val(pad))...)
+    return ntuple(i -> LVec{8, T}((P[8i - 7], P[8i - 6], P[8i - 5], P[8i - 4], P[8i - 3], P[8i - 2], P[8i - 1], P[8i])), Val((N + pad) ÷ 8))
+end

From 4c9b8bf3eb4d0f982ddfa08822cd7aa2701b03e3 Mon Sep 17 00:00:00 2001
From: Michael Helton
Date: Wed, 8 Mar 2023 11:40:43 -0500
Subject: [PATCH 04/14] Create readme.md

---
 src/SIMDMath/readme.md | 60 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 60 insertions(+)
 create mode 100644 src/SIMDMath/readme.md

diff --git a/src/SIMDMath/readme.md b/src/SIMDMath/readme.md
new file mode 100644
index 0000000..6f781cc
--- /dev/null
+++ b/src/SIMDMath/readme.md
@@ -0,0 +1,60 @@
+# SIMDMath.jl
+
+A lightweight module for explicit vectorization of simple math functions. The focus is mainly on vectorizing polynomial evaluation in two main cases: (1) evaluating many different polynomials of similar length and (2) evaluating a single large polynomial.
+This module is for statically known functions where the coefficients are unrolled and the size of the tuples is known at compile time. For more advanced needs it will be better to use SIMD.jl or LoopVectorization.jl.
+
+### Case 1: Evaluating many different polynomials.
+
+In the evaluation of special functions, we often need to compute many polynomials at the same `x`. An example structure would look like...
+```julia
+const NT = 12
+const P = (
+    ntuple(n -> rand()*(-1)^n / n, NT),
+    ntuple(n -> rand()*(-1)^n / n, NT),
+    ntuple(n -> rand()*(-1)^n / n, NT),
+    ntuple(n -> rand()*(-1)^n / n, NT)
+    )
+
+function test(x)
+    x2 = x * x
+    x3 = x2 * x
+
+    p1 = evalpoly(x3, P[1])
+    p2 = evalpoly(x3, P[2])
+    p3 = evalpoly(x3, P[3])
+    p4 = evalpoly(x3, P[4])
+
+    return muladd(x, -p2, p1), muladd(x2, p4, -p3)
+end
+```
+This structure is advantageous for vectorizing, as `p1`, `p2`, `p3`, and `p4` are independent, require the same number of evaluations, and have statically known coefficients.
+However, we are relying on the auto-vectorizer to make sure this happens, which is very fragile. In general, two polynomials might auto-vectorize depending on how the values are used, but this is not reliable.
+We can check that this function is not vectorizing (though it may on some architectures) by using `@code_llvm test(1.1)` and/or `@code_native test(1.1)`.
+
+Another way to test this is to benchmark this function and compare it to the time to compute a single polynomial.
+```julia
+julia> @btime test(x) setup=(x=rand()*2)
+  13.026 ns (0 allocations: 0 bytes)
+
+julia> @btime evalpoly(x, P[1]) setup=(x=rand()*2)
+  3.973 ns (0 allocations: 0 bytes)
+```
+In this case, `test` takes almost 4x longer, as all the polynomial evaluations are happening sequentially.
+
+We can do much better by making sure these polynomials vectorize.
+```julia +# using the same coefficients as above +julia> using Bessels.SIMDMath + +@inline function test_simd(x) + x2 = x * x + x3 = x2 * x + p = horner_simd(x3, pack_horner(P...)) + return muladd(x, -p[2].value, p[1].value), muladd(x2, p[4].value, -p[3].value) +end + +julia> @btime test_simd(x) setup=(x=rand()*2) + 4.440 ns (0 allocations: 0 bytes) +``` + + From a948fbafa31dadaea27f1443aa3605ad610f4681 Mon Sep 17 00:00:00 2001 From: Michael Helton Date: Wed, 8 Mar 2023 15:12:40 -0500 Subject: [PATCH 05/14] add generic size muladd --- src/SIMDMath/SIMDMath.jl | 5 +++++ src/SIMDMath/arithmetic.jl | 40 +++++++++----------------------------- 2 files changed, 14 insertions(+), 31 deletions(-) diff --git a/src/SIMDMath/SIMDMath.jl b/src/SIMDMath/SIMDMath.jl index 9526dc0..ce57649 100644 --- a/src/SIMDMath/SIMDMath.jl +++ b/src/SIMDMath/SIMDMath.jl @@ -26,6 +26,11 @@ const CVec{N, FloatTypes} = NTuple{2, LVec{N, FloatTypes}} const ScalarTypes = Union{VE{FloatTypes}, FloatTypes} const SIMDLanes = Union{LVec{2, Float64}, LVec{4, Float64}, LVec{8, Float64}} +const LLVMType = Dict{DataType, String}( + Float32 => "float", + Float64 => "double", +) + include("arithmetic.jl") include("shufflevector.jl") include("horner.jl") diff --git a/src/SIMDMath/arithmetic.jl b/src/SIMDMath/arithmetic.jl index c844435..269e6fd 100644 --- a/src/SIMDMath/arithmetic.jl +++ b/src/SIMDMath/arithmetic.jl @@ -10,35 +10,13 @@ import Base: muladd # muladd llvm instructions -@inline function _muladd(x::LVec{2, Float64}, y::LVec{2, Float64}, z::LVec{2, Float64}) - llvmcall("""%4 = fmul contract <2 x double> %0, %1 - %5 = fadd contract <2 x double> %4, %2 - ret <2 x double> %5""", - LVec{2, Float64}, - Tuple{LVec{2, Float64}, LVec{2, Float64}, LVec{2, Float64}}, - x, - y, - z) +@generated function _muladd(x::LVec{N, T}, y::LVec{N, T}, z::LVec{N, T}) where {N, T <: FloatTypes} + s = """ + %4 = fmul contract <$N x $(LLVMType[T])> %0, %1 + %5 = fadd contract <$N x $(LLVMType[T])> %4, %2 + ret <$N x $(LLVMType[T])> %5 + """ + return :( + llvmcall($s, LVec{N, T}, Tuple{LVec{N, T}, LVec{N, T}, LVec{N, T}}, x, y, z) + ) end - -@inline function _muladd(x::LVec{4, Float64}, y::LVec{4, Float64}, z::LVec{4, Float64}) - llvmcall("""%4 = fmul contract <4 x double> %0, %1 - %5 = fadd contract <4 x double> %4, %2 - ret <4 x double> %5""", - LVec{4, Float64}, - Tuple{LVec{4, Float64}, LVec{4, Float64}, LVec{4, Float64}}, - x, - y, - z) -end - -@inline function _muladd(x::LVec{8, Float64}, y::LVec{8, Float64}, z::LVec{8, Float64}) - llvmcall("""%4 = fmul contract <8 x double> %0, %1 - %5 = fadd contract <8 x double> %4, %2 - ret <8 x double> %5""", - LVec{8, Float64}, - Tuple{LVec{8, Float64}, LVec{8, Float64}, LVec{8, Float64}}, - x, - y, - z) -end \ No newline at end of file From 0592646ef346db074d31efe70132bb58f91e8c5b Mon Sep 17 00:00:00 2001 From: Michael Helton Date: Wed, 8 Mar 2023 15:22:55 -0500 Subject: [PATCH 06/14] add Float16 --- src/SIMDMath/SIMDMath.jl | 7 ++++--- src/SIMDMath/arithmetic.jl | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/SIMDMath/SIMDMath.jl b/src/SIMDMath/SIMDMath.jl index ce57649..ace4bfb 100644 --- a/src/SIMDMath/SIMDMath.jl +++ b/src/SIMDMath/SIMDMath.jl @@ -16,7 +16,7 @@ export horner, horner2, horner4, horner8 export pack_horner2, pack_horner4, pack_horner8 const VE = Base.VecElement -const FloatTypes = Union{Float32, Float64} +const FloatTypes = Union{Float16, Float32, Float64} const LVec{N, FloatTypes} = NTuple{N, VE{FloatTypes}} ## maybe have a complex type stored as a pack or 
real and complex values separately @@ -27,8 +27,9 @@ const ScalarTypes = Union{VE{FloatTypes}, FloatTypes} const SIMDLanes = Union{LVec{2, Float64}, LVec{4, Float64}, LVec{8, Float64}} const LLVMType = Dict{DataType, String}( - Float32 => "float", - Float64 => "double", + Float16 => "half", + Float32 => "float", + Float64 => "double", ) include("arithmetic.jl") diff --git a/src/SIMDMath/arithmetic.jl b/src/SIMDMath/arithmetic.jl index 269e6fd..7743634 100644 --- a/src/SIMDMath/arithmetic.jl +++ b/src/SIMDMath/arithmetic.jl @@ -10,7 +10,7 @@ import Base: muladd # muladd llvm instructions -@generated function _muladd(x::LVec{N, T}, y::LVec{N, T}, z::LVec{N, T}) where {N, T <: FloatTypes} +@inline @generated function _muladd(x::LVec{N, T}, y::LVec{N, T}, z::LVec{N, T}) where {N, T <: FloatTypes} s = """ %4 = fmul contract <$N x $(LLVMType[T])> %0, %1 %5 = fadd contract <$N x $(LLVMType[T])> %4, %2 From 38aae7c6d8e2ef302ce7012d16b105ef0a2c276b Mon Sep 17 00:00:00 2001 From: Michael Helton Date: Wed, 8 Mar 2023 15:31:17 -0500 Subject: [PATCH 07/14] add generic constant vector [skip ci] --- src/SIMDMath/shufflevector.jl | 34 ++++++++++------------------------ 1 file changed, 10 insertions(+), 24 deletions(-) diff --git a/src/SIMDMath/shufflevector.jl b/src/SIMDMath/shufflevector.jl index 3c070ba..b91944b 100644 --- a/src/SIMDMath/shufflevector.jl +++ b/src/SIMDMath/shufflevector.jl @@ -1,28 +1,14 @@ # create tuples of VecElements filled with a constant value of x -@inline constantvector(x::Float64, y) = constantvector(VE(x), y) +@inline constantvector(x::T, y) where T <: FloatTypes = constantvector(VE(x), y) -@inline function constantvector(x::VecElement{Float64}, y::Type{LVec{2, Float64}}) - llvmcall("""%2 = insertelement <2 x double> undef, double %0, i32 0 - %3 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> zeroinitializer - ret <2 x double> %3""", - LVec{2, Float64}, - Tuple{VecElement{Float64}}, - x) -end -@inline function constantvector(x::VecElement{Float64}, y::Type{LVec{4, Float64}}) - llvmcall("""%2 = insertelement <4 x double> undef, double %0, i32 0 - %3 = shufflevector <4 x double> %2, <4 x double> undef, <4 x i32> zeroinitializer - ret <4 x double> %3""", - LVec{4, Float64}, - Tuple{VecElement{Float64}}, - x) -end -@inline function constantvector(x::VecElement{Float64}, y::Type{LVec{8, Float64}}) - llvmcall("""%2 = insertelement <8 x double> undef, double %0, i32 0 - %3 = shufflevector <8 x double> %2, <8 x double> undef, <8 x i32> zeroinitializer - ret <8 x double> %3""", - LVec{8, Float64}, - Tuple{VecElement{Float64}}, - x) +@inline @generated function constantvector(x::VecElement{T}, y::Type{LVec{N, T}}) where {N, T <: FloatTypes} + s = """ + %2 = insertelement <$N x $(LLVMType[T])> undef, $(LLVMType[T]) %0, i32 0 + %3 = shufflevector <$N x $(LLVMType[T])> %2, <$N x $(LLVMType[T])> undef, <$N x i32> zeroinitializer + ret <$N x $(LLVMType[T])> %3 + """ + return :( + llvmcall($s, LVec{N, T}, Tuple{VecElement{T}}, x) + ) end From 1b04104891f2f2db06f119d88cd5918691c8476e Mon Sep 17 00:00:00 2001 From: Michael Helton Date: Thu, 9 Mar 2023 11:26:17 -0500 Subject: [PATCH 08/14] add tests and simplify [skip ci] --- src/SIMDMath/SIMDMath.jl | 7 ---- src/SIMDMath/arithmetic.jl | 18 ++++++++ src/SIMDMath/horner.jl | 43 +++++++++---------- src/SIMDMath/readme.md | 20 +++++++++ src/SIMDMath/shufflevector.jl | 14 ------- src/SIMDMath/test.jl | 78 +++++++++++++++++++++++++++++++++++ 6 files changed, 136 insertions(+), 44 deletions(-) delete mode 100644 
src/SIMDMath/shufflevector.jl create mode 100644 src/SIMDMath/test.jl diff --git a/src/SIMDMath/SIMDMath.jl b/src/SIMDMath/SIMDMath.jl index ace4bfb..c3d7ceb 100644 --- a/src/SIMDMath/SIMDMath.jl +++ b/src/SIMDMath/SIMDMath.jl @@ -18,13 +18,7 @@ export pack_horner2, pack_horner4, pack_horner8 const VE = Base.VecElement const FloatTypes = Union{Float16, Float32, Float64} const LVec{N, FloatTypes} = NTuple{N, VE{FloatTypes}} - -## maybe have a complex type stored as a pack or real and complex values separately -const CVec{N, FloatTypes} = NTuple{2, LVec{N, FloatTypes}} -# CVec{2, Float64}((LVec{2, Float64}((1.1, 1.2)), LVec{2, Float64}((1.5, 1.0)))) - const ScalarTypes = Union{VE{FloatTypes}, FloatTypes} -const SIMDLanes = Union{LVec{2, Float64}, LVec{4, Float64}, LVec{8, Float64}} const LLVMType = Dict{DataType, String}( Float16 => "half", @@ -33,7 +27,6 @@ const LLVMType = Dict{DataType, String}( ) include("arithmetic.jl") -include("shufflevector.jl") include("horner.jl") end diff --git a/src/SIMDMath/arithmetic.jl b/src/SIMDMath/arithmetic.jl index 7743634..4bf847a 100644 --- a/src/SIMDMath/arithmetic.jl +++ b/src/SIMDMath/arithmetic.jl @@ -1,5 +1,23 @@ import Base: muladd +## ShuffleVector + +# create tuples of VecElements filled with a constant value of x +@inline constantvector(x::T, y) where T <: FloatTypes = constantvector(VE(x), y) + +@inline @generated function constantvector(x::VecElement{T}, y::Type{LVec{N, T}}) where {N, T <: FloatTypes} + s = """ + %2 = insertelement <$N x $(LLVMType[T])> undef, $(LLVMType[T]) %0, i32 0 + %3 = shufflevector <$N x $(LLVMType[T])> %2, <$N x $(LLVMType[T])> undef, <$N x i32> zeroinitializer + ret <$N x $(LLVMType[T])> %3 + """ + return :( + llvmcall($s, LVec{N, T}, Tuple{VecElement{T}}, x) + ) +end + +## muladd + @inline muladd(x::LVec{N, T}, y::LVec{N, T}, z::LVec{N, T}) where {N, T <: FloatTypes} = _muladd(x, y, z) @inline muladd(x::ScalarTypes, y::LVec{N, T}, z::LVec{N, T}) where {N, T <: FloatTypes} = muladd(constantvector(x, LVec{N, T}), y, z) @inline muladd(x::LVec{N, T}, y::ScalarTypes, z::LVec{N, T}) where {N, T <: FloatTypes} = muladd(x, constantvector(y, LVec{N, T}), z) diff --git a/src/SIMDMath/horner.jl b/src/SIMDMath/horner.jl index 5039193..61b9452 100644 --- a/src/SIMDMath/horner.jl +++ b/src/SIMDMath/horner.jl @@ -21,10 +21,9 @@ # # cast single element `x` to a width of the number of polynomials -horner_simd(x::Union{T, VE{T}}, p::NTuple{N, LVec{M, T}}) where {N, M, T} = horner_simd(constantvector(x, LVec{M, T}), p) +@inline horner_simd(x::Union{T, VE{T}}, p::NTuple{N, LVec{M, T}}) where {N, M, T} = horner_simd(constantvector(x, LVec{M, T}), p) -# In theory, this can accept any number of packed polynomials though for SIMD this really should only accept `M = 2, 4, 8` and for Float32 `M = 16` -@inline function horner_simd(x::LVec{M, Float64}, p::NTuple{N, LVec{M, Float64}}) where {N, M} +@inline function horner_simd(x::LVec{M, T}, p::NTuple{N, LVec{M, T}}) where {N, M, T <: FloatTypes} a = p[end] @inbounds for i in N-1:-1:1 a = muladd(x, a, p[i]) @@ -32,9 +31,7 @@ horner_simd(x::Union{T, VE{T}}, p::NTuple{N, LVec{M, T}}) where {N, M, T} = horn return a end -@inline pack_horner(p1::NTuple{N, T}, p2::NTuple{N, T}) where {N, T <: Float64} = ntuple(i -> LVec{2, T}((p1[i], p2[i])), Val(N)) -@inline pack_horner(p1::NTuple{N, T}, p2::NTuple{N, T}, p3::NTuple{N, T}, p4::NTuple{N, T}) where {N, T <: Float64} = ntuple(i -> LVec{4, T}((p1[i], p2[i], p3[i], p4[i])), Val(N)) -@inline pack_horner(p1::NTuple{N, T}, p2::NTuple{N, T}, p3::NTuple{N, 
T}, p4::NTuple{N, T}, p5::NTuple{N, T}, p6::NTuple{N, T}, p7::NTuple{N, T}, p8::NTuple{N, T}) where {N, T <: Float64} = ntuple(i -> LVec{8, T}((p1[i], p2[i], p3[i], p4[i], p5[i], p6[i], p7[i], p8[i])), Val(N)) +@inline pack_horner(P::Tuple{Vararg{NTuple{M, T}, N}}) where {N, M, T} = ntuple(i -> LVec{N, T}((ntuple(j -> P[j][i], Val(N)))), Val(M)) # # Horner scheme for evaluating single polynomials @@ -57,23 +54,23 @@ end # horner8 - 8th order horner scheme that splits into a + ix^8 + .... & etc @inline horner(x::T, P::NTuple{N, T}) where {N, T <: FloatTypes} = evalpoly(x, P) -@inline horner2(x, P::NTuple{N, Float64}) where N = horner2(x, pack_horner2(P)) -@inline horner4(x, P::NTuple{N, Float64}) where N = horner4(x, pack_horner4(P)) -@inline horner8(x, P::NTuple{N, Float64}) where N = horner8(x, pack_horner8(P)) +@inline horner2(x, P::NTuple{N, T}) where {N, T <: FloatTypes} = horner2(x, pack_horner2(P)) +@inline horner4(x, P::NTuple{N, T}) where {N, T <: FloatTypes} = horner4(x, pack_horner4(P)) +@inline horner8(x, P::NTuple{N, T}) where {N, T <: FloatTypes} = horner8(x, pack_horner8(P)) -@inline function horner2(x, P::NTuple{N, LVec{2, Float64}}) where N +@inline function horner2(x, P::NTuple{N, LVec{2, T}}) where {N, T <: FloatTypes} a = horner_simd(x * x, P) return muladd(x, a[2].value, a[1].value) end -@inline function horner4(x, P::NTuple{N, LVec{4, Float64}}) where N +@inline function horner4(x, P::NTuple{N, LVec{4, T}}) where {N, T <: FloatTypes} xx = x * x a = horner_simd(xx * xx, P) - b = muladd(x, LVec{2, Float64}((a[4].value, a[2].value)), LVec{2, Float64}((a[3].value, a[1].value))) + b = muladd(x, LVec{2, T}((a[4].value, a[2].value)), LVec{2, T}((a[3].value, a[1].value))) return muladd(xx, b[1].value, b[2].value) end -@inline function horner8(x, P::NTuple{N, LVec{8, Float64}}) where N +@inline function horner8(x, P::NTuple{N, LVec{8, T}}) where {N, T <: FloatTypes} x2 = x * x x4 = x2 * x2 a = horner_simd(x4 * x4, P) @@ -81,8 +78,8 @@ end # following computes # a[1].value + a[2].value*x + a[3].value*x^2 + a[4].value*x^3 + a[5].value*x^4 + a[6].value*x^5 + a[7].value*x^6 + a[8].value*x^7 - b = muladd(x, LVec{4, Float64}((a[4].value, a[2].value, a[8].value, a[6].value)), LVec{4, Float64}((a[3].value, a[1].value, a[7].value, a[5].value))) - c = muladd(x2, LVec{2, Float64}((b[1].value, b[3].value)), LVec{2, Float64}((b[2].value, b[4].value))) + b = muladd(x, LVec{4, T}((a[4].value, a[2].value, a[8].value, a[6].value)), LVec{4, T}((a[3].value, a[1].value, a[7].value, a[5].value))) + c = muladd(x2, LVec{2, T}((b[1].value, b[3].value)), LVec{2, T}((b[2].value, b[4].value))) return muladd(x4, c[2].value, c[1].value) end @@ -104,23 +101,23 @@ end # TODO: Use a generated function instead might make the packing more reliable for all polynomial degrees # -@inline function pack_horner2(p::NTuple{N, T}) where {N, T <: Float64} +@inline function pack_horner2(p::NTuple{N, T}) where {N, T <: FloatTypes} rem = N % 2 pad = !iszero(rem) ? (2 - rem) : 0 - P = (p..., ntuple(i -> 0.0, Val(pad))...) - return ntuple(i -> LVec{2, T}((P[2*i - 1], P[2i])), Val((N + pad) ÷ 2)) + P = (p..., ntuple(i -> zero(T), Val(pad))...) + return ntuple(i -> LVec{2, T}((P[2i - 1], P[2i])), Val((N + pad) ÷ 2)) end -@inline function pack_horner4(p::NTuple{N, T}) where {N, T <: Float64} +@inline function pack_horner4(p::NTuple{N, T}) where {N, T <: FloatTypes} rem = N % 4 pad = !iszero(rem) ? (4 - rem) : 0 - P = (p..., ntuple(i -> 0.0, Val(pad))...) 
-    return ntuple(i -> LVec{4, T}((P[4*i - 3], P[4i - 2], P[4i - 1], P[4i])), Val((N + pad) ÷ 4))
+    return ntuple(i -> LVec{4, T}((P[4i - 3], P[4i - 2], P[4i - 1], P[4i])), Val((N + pad) ÷ 4))
 end
 
-@inline function pack_horner8(p::NTuple{N, T}) where {N, T <: Float64}
+@inline function pack_horner8(p::NTuple{N, T}) where {N, T <: FloatTypes}
     rem = N % 8
     pad = !iszero(rem) ? (8 - rem) : 0
-    P = (p..., ntuple(i -> 0.0, Val(pad))...)
+    P = (p..., ntuple(i -> zero(T), Val(pad))...)
     return ntuple(i -> LVec{8, T}((P[8i - 7], P[8i - 6], P[8i - 5], P[8i - 4], P[8i - 3], P[8i - 2], P[8i - 1], P[8i])), Val((N + pad) ÷ 8))
 end
diff --git a/src/SIMDMath/readme.md b/src/SIMDMath/readme.md
index 6f781cc..52c4afd 100644
--- a/src/SIMDMath/readme.md
+++ b/src/SIMDMath/readme.md
@@ -57,4 +57,24 @@ julia> @btime test_simd(x) setup=(x=rand()*2)
   4.440 ns (0 allocations: 0 bytes)
 ```
 
+### Case 2: Evaluating a single polynomial.
+
+In some cases, we are interested in improving the performance when evaluating a single polynomial of larger degree. Horner's scheme is latency bound and for large polynomials (N>10) this can become a large part of the total runtime. In this case we are aiming to improve the performance of the following structure.
+```julia
+const P4 = ntuple(n -> rand()*(-1)^n / n, 4)
+const P8 = ntuple(n -> rand()*(-1)^n / n, 8)
+const P16 = ntuple(n -> rand()*(-1)^n / n, 16)
+const P32 = ntuple(n -> rand()*(-1)^n / n, 32)
+const P64 = ntuple(n -> rand()*(-1)^n / n, 64)
+
+@btime evalpoly(x, P4) setup=(x=rand())
+@btime evalpoly(x, P8) setup=(x=rand())
+@btime evalpoly(x, P16) setup=(x=rand())
+@btime evalpoly(x, P32) setup=(x=rand())
+@btime evalpoly(x, P64) setup=(x=rand())
+```
+
+As mentioned, Horner's scheme requires sequential multiply-add instructions that can't be performed in parallel. One way to improve this structure (another is Estrin's method, which we won't discuss) is to break the polynomial down into even and odd polynomials (a second order Horner's scheme) or into larger powers of `x^4` or `x^8` (a fourth and eighth order Horner's scheme), which allow for computing many different polynomials of similar length simultaneously. In some regard, we are just rearranging the coefficients and using the same method as we did in the first case, with some additional arithmetic at the end to add all the different polynomials together.
+
+The last fact is important because we are actually increasing the total number of arithmetic operations needed but greatly increasing the number of operations that can happen in parallel. The increased operations make the advantages of this approach less straightforward than the first case, which is always superior. The second and perhaps most important point is that floating point arithmetic is not associative, so these approaches will give slightly different results as we are adding and multiplying in a slightly different order.
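
To make the even/odd split concrete, here is a scalar sketch of the second order scheme in plain Julia, independent of the packed `LVec` types (`horner2_scalar` is an illustrative name, not part of the module):

```julia
# Split p(x) = (p₁ + p₃x² + p₅x⁴ + ...) + x * (p₂ + p₄x² + p₆x⁴ + ...).
# The two half-length polynomials in x² have no data dependence on each other,
# so their multiply-add chains can run in parallel.
function horner2_scalar(x, p::NTuple{N, Float64}) where N
    isone(N) && return p[1]
    x2 = x * x
    p_even = evalpoly(x2, ntuple(i -> p[2i - 1], Val((N + 1) ÷ 2)))  # coefficients of 1, x², x⁴, ...
    p_odd  = evalpoly(x2, ntuple(i -> p[2i], Val(N ÷ 2)))            # coefficients of x, x³, x⁵, ...
    return muladd(x, p_odd, p_even)
end
```

`horner2` with `pack_horner2` performs essentially this rearrangement, but evaluates the even and odd pieces in two SIMD lanes instead of relying on the optimizer to overlap them.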
diff --git a/src/SIMDMath/shufflevector.jl b/src/SIMDMath/shufflevector.jl deleted file mode 100644 index b91944b..0000000 --- a/src/SIMDMath/shufflevector.jl +++ /dev/null @@ -1,14 +0,0 @@ -# create tuples of VecElements filled with a constant value of x - -@inline constantvector(x::T, y) where T <: FloatTypes = constantvector(VE(x), y) - -@inline @generated function constantvector(x::VecElement{T}, y::Type{LVec{N, T}}) where {N, T <: FloatTypes} - s = """ - %2 = insertelement <$N x $(LLVMType[T])> undef, $(LLVMType[T]) %0, i32 0 - %3 = shufflevector <$N x $(LLVMType[T])> %2, <$N x $(LLVMType[T])> undef, <$N x i32> zeroinitializer - ret <$N x $(LLVMType[T])> %3 - """ - return :( - llvmcall($s, LVec{N, T}, Tuple{VecElement{T}}, x) - ) -end diff --git a/src/SIMDMath/test.jl b/src/SIMDMath/test.jl new file mode 100644 index 0000000..1f72574 --- /dev/null +++ b/src/SIMDMath/test.jl @@ -0,0 +1,78 @@ +using SIMDMath +using Test + + +# Test horner_simd +let + NT = 12 + P = ( + ntuple(n -> rand()*(-1)^n / n, NT), + ntuple(n -> rand()*(-1)^n / n, NT), + ntuple(n -> rand()*(-1)^n / n, NT), + ntuple(n -> rand()*(-1)^n / n, NT) + ) + + x = 0.9 + a = horner_simd(x, pack_horner(P)) + @test evalpoly(x, P[1]) == a[1].value + @test evalpoly(x, P[2]) == a[2].value + @test evalpoly(x, P[3]) == a[3].value + @test evalpoly(x, P[4]) == a[4].value + + NT = 24 + P32 = ( + ntuple(n -> Float32(rand()*(-1)^n / n), NT), + ntuple(n -> Float32(rand()*(-1)^n / n), NT), + ntuple(n -> Float32(rand()*(-1)^n / n), NT), + ntuple(n -> Float32(rand()*(-1)^n / n), NT), + ntuple(n -> Float32(rand()*(-1)^n / n), NT), + ntuple(n -> Float32(rand()*(-1)^n / n), NT), + ntuple(n -> Float32(rand()*(-1)^n / n), NT), + ntuple(n -> Float32(rand()*(-1)^n / n), NT), + ) + + x = 1.2f0 + a = horner_simd(x, pack_horner(P32)) + @test evalpoly(x, P32[1]) == a[1].value + @test evalpoly(x, P32[2]) == a[2].value + @test evalpoly(x, P32[3]) == a[3].value + @test evalpoly(x, P32[4]) == a[4].value + @test evalpoly(x, P32[5]) == a[5].value + @test evalpoly(x, P32[6]) == a[6].value + @test evalpoly(x, P32[7]) == a[7].value + @test evalpoly(x, P32[8]) == a[8].value + + NT = 4 + P16 = ( + ntuple(n -> Float16(rand()*(-1)^n / n), NT), + ntuple(n -> Float16(rand()*(-1)^n / n), NT), + ntuple(n -> Float16(rand()*(-1)^n / n), NT), + ) + + x = Float16(0.8) + a = horner_simd(x, pack_horner(P16)) + @test evalpoly(x, P16[1]) ≈ a[1].value + @test evalpoly(x, P16[2]) ≈ a[2].value + @test evalpoly(x, P16[3]) ≈ a[3].value + +end + +# test second, fourth, and eighth order horner schemes +# note that the higher order schemes are more prone to overflow when using lower precision +# because you have to directly compute x^2, x^4, x^8 before the routine +let + for N in [2, 3, 4, 5, 6, 7, 10, 13, 17, 20, 52, 89], x in [0.1, 0.5, 1.5, 4.2, 45.0] + poly = ntuple(n -> rand()*(-1)^n / n, N) + @test evalpoly(x, poly) ≈ horner2(x, pack_horner2(poly)) ≈ horner4(x, pack_horner4(poly)) ≈ horner8(x, pack_horner8(poly)) + end + for N in [2, 3, 4, 5, 6, 7, 10], x32 in [0.1f0, 0.8f0, 2.4f0, 8.0f0, 25.0f0] + poly32 = ntuple(n -> Float32(rand()*(-1)^n / n), N) + @test evalpoly(x32, poly32) ≈ horner2(x32, pack_horner2(poly32)) ≈ horner4(x32, pack_horner4(poly32)) ≈ horner8(x32, pack_horner8(poly32)) + end + for N in [2, 3, 4, 5, 6], x in [0.1, 0.5, 2.2] + poly16 = ntuple(n -> Float16(rand()*(-1)^n / n), N) + x16 = Float16.(x) + @test evalpoly(x16, poly16) ≈ horner2(x16, pack_horner2(poly16)) ≈ horner4(x16, pack_horner4(poly16)) ≈ horner8(x16, pack_horner8(poly16)) + end + +end From 
9f67b37e7df830fff624f7759ae264c00bc637f8 Mon Sep 17 00:00:00 2001
From: Michael Helton
Date: Thu, 9 Mar 2023 14:24:55 -0500
Subject: [PATCH 09/14] add updated benchmark ex [skip ci]

---
 src/SIMDMath/readme.md | 54 ++++++++++++++++++++++++++++++++----------
 1 file changed, 42 insertions(+), 12 deletions(-)

diff --git a/src/SIMDMath/readme.md b/src/SIMDMath/readme.md
index 52c4afd..284e406 100644
--- a/src/SIMDMath/readme.md
+++ b/src/SIMDMath/readme.md
@@ -59,22 +59,52 @@
 
 ### Case 2: Evaluating a single polynomial.
 
-In some cases, we are interested in improving the performance when evaluating a single polynomial of larger degree. Horner's scheme is latency bound and for large polynomials (N>10) this can become a large part of the total runtime. In this case we are aiming to improve the performance of the following structure.
+In some cases, we are interested in improving the performance when evaluating a single polynomial of larger degree. Horner's scheme is latency bound and for large polynomials (N>10) this can become a large part of the total runtime. We can test the performance of a straight Horner scheme, using the Base library function `evalpoly`, against the higher order Horner schemes.
 ```julia
-const P4 = ntuple(n -> rand()*(-1)^n / n, 4)
-const P8 = ntuple(n -> rand()*(-1)^n / n, 8)
-const P16 = ntuple(n -> rand()*(-1)^n / n, 16)
-const P32 = ntuple(n -> rand()*(-1)^n / n, 32)
-const P64 = ntuple(n -> rand()*(-1)^n / n, 64)
-
-@btime evalpoly(x, P4) setup=(x=rand())
-@btime evalpoly(x, P8) setup=(x=rand())
-@btime evalpoly(x, P16) setup=(x=rand())
-@btime evalpoly(x, P32) setup=(x=rand())
-@btime evalpoly(x, P64) setup=(x=rand())
+let
+    horner_times = []
+    horner2_times = []
+    horner4_times = []
+    horner8_times = []
+
+    for N in [4, 8, 12, 16, 24, 48, 96, 198]
+        poly = ntuple(n -> rand()*(-1)^n / n, N)
+        poly_packed2 = pack_horner2(poly)
+        poly_packed4 = pack_horner4(poly)
+        poly_packed8 = pack_horner8(poly)
+
+        t1 = @benchmark evalpoly(x, $poly) setup=(x=rand())
+        t2 = @benchmark horner2(x, $poly_packed2) setup=(x=rand())
+        t3 = @benchmark horner4(x, $poly_packed4) setup=(x=rand())
+        t4 = @benchmark horner8(x, $poly_packed8) setup=(x=rand())
+
+        push!(horner_times, round(minimum(t1).time, digits=3))
+        push!(horner2_times, round(minimum(t2).time, digits=3))
+        push!(horner4_times, round(minimum(t3).time, digits=3))
+        push!(horner8_times, round(minimum(t4).time, digits=3))
+
+    end
+    @show horner_times
+    @show horner2_times
+    @show horner4_times
+    @show horner8_times
+end
 ```
 
 As mentioned, Horner's scheme requires sequential multiply-add instructions that can't be performed in parallel. One way to improve this structure (another is Estrin's method, which we won't discuss) is to break the polynomial down into even and odd polynomials (a second order Horner's scheme) or into larger powers of `x^4` or `x^8` (a fourth and eighth order Horner's scheme), which allow for computing many different polynomials of similar length simultaneously. In some regard, we are just rearranging the coefficients and using the same method as we did in the first case, with some additional arithmetic at the end to add all the different polynomials together.
 
 The last fact is important because we are actually increasing the total number of arithmetic operations needed but greatly increasing the number of operations that can happen in parallel.
The increased operations make the advantages of this approach less straightforward than the first case, which is always superior. The second and perhaps most important point is that floating point arithmetic is not associative, so these approaches will give slightly different results as we are adding and multiplying in a slightly different order.
 
+The above benchmarks are as follows.
+```julia
+Julia Version 1.8.2
+Platform Info:
+  OS: macOS (arm64-apple-darwin21.3.0)
+  CPU: 8 × Apple M1
+
+horner_times = [2.125, 2.125, 2.416, 2.416, 4.000, 12.345, 40.573, 126.353]
+horner2_times = [2.125, 2.084, 2.125, 2.125, 2.417, 4.125, 24.849, 68.818]
+horner4_times = [2.125, 2.084, 2.125, 2.416, 2.416, 3.291, 8.6340, 29.271]
+horner8_times = [2.416, 2.416, 2.458, 2.416, 2.416, 3.375, 6.5000, 17.41]
+```
+Asymptotically, we can see that the method approaches a 2, 4, and 8x increase, respectively, for large degrees; however, for smaller degrees the advantages are more complex. Therefore, it is encouraged to test the performance for individual cases. Of course, this depends on statically knowing the polynomial size during compilation, which allows for packing the coefficients in the most efficient way.

From f8f0ba7310c8c9dc35e9ed8eadcc8907059fbc42 Mon Sep 17 00:00:00 2001
From: Michael Helton
Date: Thu, 9 Mar 2023 15:27:54 -0500
Subject: [PATCH 10/14] add normalized benchmarks

---
 src/SIMDMath/readme.md | 34 +++++++++++++++++++++++----------
 1 file changed, 25 insertions(+), 9 deletions(-)

diff --git a/src/SIMDMath/readme.md b/src/SIMDMath/readme.md
index 284e406..9330211 100644
--- a/src/SIMDMath/readme.md
+++ b/src/SIMDMath/readme.md
@@ -67,7 +67,7 @@ let
     horner4_times = []
     horner8_times = []
 
-    for N in [4, 8, 12, 16, 24, 48, 96, 198]
+    for N in [4, 8, 12, 16, 32, 64, 128, 256, 512]
         poly = ntuple(n -> rand()*(-1)^n / n, N)
         poly_packed2 = pack_horner2(poly)
         poly_packed4 = pack_horner4(poly)
@@ -84,10 +84,9 @@ let
         push!(horner8_times, round(minimum(t4).time, digits=3))
 
     end
-    @show horner_times
-    @show horner2_times
-    @show horner4_times
-    @show horner8_times
+    @show horner_times ./ horner2_times
+    @show horner_times ./ horner4_times
+    @show horner_times ./ horner8_times
 end
 ```
 
@@ -97,14 +96,31 @@
 The above benchmarks are as follows.
+The above benchmarks are as follows.
+```julia
+Julia Version 1.8.2
+Platform Info:
+  OS: macOS (arm64-apple-darwin21.3.0)
+  CPU: 8 × Apple M1
+
+horner_times = [2.125, 2.125, 2.416, 2.416, 4.000, 12.345, 40.573, 126.353]
+horner2_times = [2.125, 2.084, 2.125, 2.125, 2.417, 4.125, 24.849, 68.818]
+horner4_times = [2.125, 2.084, 2.125, 2.416, 2.416, 3.291, 8.6340, 29.271]
+horner8_times = [2.416, 2.416, 2.458, 2.416, 2.416, 3.375, 6.5000, 17.41]
+```
+Asymptotically, we can see that the method approaches a 2, 4, and 8x increase, respectively, for large degrees; however, for smaller degrees the advantages are less clear-cut. Therefore, it is encouraged to test the performance for individual cases. Of course, this depends on statically knowing the polynomial size during compilation, which allows for packing the coefficients in the most efficient way.

From f8f0ba7310c8c9dc35e9ed8eadcc8907059fbc42 Mon Sep 17 00:00:00 2001
From: Michael Helton
Date: Thu, 9 Mar 2023 15:27:54 -0500
Subject: [PATCH 10/14] add normalized benchmarks

---
 src/SIMDMath/readme.md | 34 +++++++++++++++++++++++++---------
 1 file changed, 25 insertions(+), 9 deletions(-)

diff --git a/src/SIMDMath/readme.md b/src/SIMDMath/readme.md
index 284e406..9330211 100644
--- a/src/SIMDMath/readme.md
+++ b/src/SIMDMath/readme.md
@@ -67,7 +67,7 @@ let
     horner4_times = []
     horner8_times = []
 
-    for N in [4, 8, 12, 16, 24, 48, 96, 198]
+    for N in [4, 8, 12, 16, 32, 64, 128, 256, 512]
         poly = ntuple(n -> rand()*(-1)^n / n, N)
         poly_packed2 = pack_horner2(poly)
         poly_packed4 = pack_horner4(poly)
@@ -84,10 +84,9 @@ let
         push!(horner8_times, round(minimum(t4).time, digits=3))
 
     end
-    @show horner_times
-    @show horner2_times
-    @show horner4_times
-    @show horner8_times
+    @show horner_times ./ horner2_times
+    @show horner_times ./ horner4_times
+    @show horner_times ./ horner8_times
 end
 ```
 
@@ -97,14 +96,31 @@ The last fact is important because we are actually increasing the total amount o
 The above benchmarks are as follows.
 ```julia
+# Relative Speedup compared to evalpoly when evaluating a polynomial of N degree
+# N = [4, 8, 12, 16, 32, 64, 128, 256, 512]
+
+# CPU #1
 Julia Version 1.8.2
 Platform Info:
   OS: macOS (arm64-apple-darwin21.3.0)
   CPU: 8 × Apple M1
 
-horner_times = [2.125, 2.125, 2.416, 2.416, 4.000, 12.345, 40.573, 126.353]
-horner2_times = [2.125, 2.084, 2.125, 2.125, 2.417, 4.125, 24.849, 68.818]
-horner4_times = [2.125, 2.084, 2.125, 2.416, 2.416, 3.291, 8.6340, 29.271]
-horner8_times = [2.416, 2.416, 2.458, 2.416, 2.416, 3.375, 6.5000, 17.41]
+# N = [ 4, 8, 12, 16, 32, 64, 128, 256, 512]
+horner2 = [1.00, 1.00, 1.16, 1.16, 2.57, 3.14, 1.59, 1.76, 2.05]
+horner4 = [1.00, 0.98, 1.14, 1.00, 2.57, 4.23, 3.53, 3.87, 4.57]
+horner8 = [0.88, 0.86, 1.00, 1.00, 2.29, 4.91, 6.21, 7.19, 8.45]
+
+# CPU #2
+julia> versioninfo()
+Julia Version 1.8.2
+Commit 36034abf260 (2022-09-29 15:21 UTC)
+Platform Info:
+  OS: Linux (x86_64-linux-gnu)
+  CPU: 12 × Intel(R) Core(TM) i7-8700K CPU @ 3.70GHz
+
+# N = [ 4, 8, 12, 16, 32, 64, 128, 256, 512]
+horner2 = [1.00, 1.11, 1.30, 1.37, 2.61, 2.11, 2.09, 2.59, 2.27]
+horner4 = [1.00, 0.93, 1.21, 1.44, 3.30, 5.77, 4.69, 5.41, 5.65]
+horner8 = [0.68, 0.84, 0.95, 1.26, 3.05, 6.37, 6.98, 8.76, 9.45]
 ```
 Asymptotically, we can see that the method approaches a 2, 4, and 8x increase, respectively, for large degrees; however, for smaller degrees the advantages are less clear-cut. Therefore, it is encouraged to test the performance for individual cases. Of course, this depends on statically knowing the polynomial size during compilation, which allows for packing the coefficients in the most efficient way.

From 7a244aed78cf1ad17c1bc69de2fe4fc83306be02 Mon Sep 17 00:00:00 2001
From: Michael Helton
Date: Fri, 17 Mar 2023 11:02:54 -0400
Subject: [PATCH 11/14] add SIMD clenshaw algorithm

---
 src/SIMDMath/SIMDMath.jl                      |  2 ++
 src/SIMDMath/arithmetic.jl                    | 13 +++++++
 src/SIMDMath/horner.jl                        | 12 +++++++
 src/SIMDMath/test.jl => test/SIMDMath_test.jl | 36 ++++++++++++++++++-
 test/runtests.jl                              |  1 +
 5 files changed, 63 insertions(+), 1 deletion(-)
 rename src/SIMDMath/test.jl => test/SIMDMath_test.jl (71%)

diff --git a/src/SIMDMath/SIMDMath.jl b/src/SIMDMath/SIMDMath.jl
index c3d7ceb..08efbc4 100644
--- a/src/SIMDMath/SIMDMath.jl
+++ b/src/SIMDMath/SIMDMath.jl
@@ -15,6 +15,8 @@ export horner_simd, pack_horner
 export horner, horner2, horner4, horner8
 export pack_horner2, pack_horner4, pack_horner8
 
+export clenshaw_simd
+
 const VE = Base.VecElement
 const FloatTypes = Union{Float16, Float32, Float64}
 const LVec{N, FloatTypes} = NTuple{N, VE{FloatTypes}}

diff --git a/src/SIMDMath/arithmetic.jl b/src/SIMDMath/arithmetic.jl
index 4bf847a..0612d01 100644
--- a/src/SIMDMath/arithmetic.jl
+++ b/src/SIMDMath/arithmetic.jl
@@ -38,3 +38,16 @@ end
         llvmcall($s, LVec{N, T}, Tuple{LVec{N, T}, LVec{N, T}, LVec{N, T}}, x, y, z)
     )
 end
+
+for f in (:fadd, :fsub, :fmul, :fdiv)
+    @eval @inline @generated function $f(x::LVec{N, T}, y::LVec{N, T}) where {N, T <: FloatTypes}
+        ff = $(QuoteNode(f))
+        s = """
+        %3 = $ff <$N x $(LLVMType[T])> %0, %1
+        ret <$N x $(LLVMType[T])> %3
+        """
+        return :(
+            llvmcall($s, LVec{N, T}, Tuple{LVec{N, T}, LVec{N, T}}, x, y)
+        )
+    end
+end

diff --git a/src/SIMDMath/horner.jl b/src/SIMDMath/horner.jl
index 61b9452..646be55 100644
--- a/src/SIMDMath/horner.jl
+++ b/src/SIMDMath/horner.jl
@@ -33,6 +33,18 @@ end
 
 @inline pack_horner(P::Tuple{Vararg{NTuple{M, T}, N}}) where {N, M, T} = ntuple(i -> LVec{N, T}((ntuple(j -> P[j][i], Val(N)))), Val(M))
 
+# need to add multiply/add/subtract commands
+# finish the clenshaw_simd algorithm
+@inline function clenshaw_simd(x::T, c::NTuple{N, LVec{M, T}}) where {N, M, T <: FloatTypes}
+    x2 = constantvector(2*x, LVec{M, T})
+    c0 = c[end-1]
+    c1 = c[end]
+    @inbounds for i in length(c)-2:-1:1
+        c0, c1 = fsub(c[i], c1), muladd(x2, c1, c0)
+    end
+    return muladd(x, c1, c0)
+end
+
 #
 # Horner scheme for evaluating single polynomials
 #

diff --git a/src/SIMDMath/test.jl b/test/SIMDMath_test.jl
similarity index 71%
rename from src/SIMDMath/test.jl
rename to test/SIMDMath_test.jl
index 1f72574..6c940e3 100644
--- a/src/SIMDMath/test.jl
+++ b/test/SIMDMath_test.jl
@@ -1,4 +1,4 @@
-using SIMDMath
+using Bessels.SIMDMath
 
 using Test
 
@@ -76,3 +76,37 @@ let
     end
 
 end
+
+# Tests for Clenshaw algorithm to evaluate Chebyshev polynomials
+
+# non-SIMD version
+function clen(x, c)
+    x2 = 2x
+    c0 = c[end-1]
+    c1 = c[end]
+    for i in length(c)-2:-1:1
+        c0, c1 = c[i] - c1, c0 + c1 * x2
+    end
+    return c0 + c1 * x
+end
+
+let
+    for N in [2, 6, 10], x in [0.1, 0.5, 1.5, 4.2, 45.0]
+        P = (
+            ntuple(n -> rand()*(-1)^n / n, N),
+            ntuple(n -> rand()*(-1)^n / n, N),
+            ntuple(n -> rand()*(-1)^n / n, N),
+            ntuple(n -> rand()*(-1)^n / n, N)
+        )
+        # The native code generated for the SIMD and non-SIMD cases differs slightly:
+        # the SIMD case always uses the fsub instruction, while the non-SIMD case
+        # sometimes reorders it during native code generation. Small tests showed the
+        # SIMD ordering was slightly more accurate, and the SIMD version is also
+        # faster than even a single scalar evaluation.
+        a = clenshaw_simd(x, pack_horner(P))
+        @test clen(x, P[1]) ≈ a[1].value
+        @test clen(x, P[2]) ≈ a[2].value
+        @test clen(x, P[3]) ≈ a[3].value
+        @test clen(x, P[4]) ≈ a[4].value
+    end
+end

diff --git a/test/runtests.jl b/test/runtests.jl
index 69e6085..8452f2b 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -10,3 +10,4 @@ import SpecialFunctions
 @time @testset "gamma" begin include("gamma_test.jl") end
 @time @testset "airy" begin include("airy_test.jl") end
 @time @testset "sphericalbessel" begin include("sphericalbessel_test.jl") end
+@time @testset "SIMDMath" begin include("SIMDMath_test.jl") end
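A brief usage sketch of the new kernel (a hypothetical snippet mirroring the test above; `clen` is the scalar Clenshaw reference defined in `SIMDMath_test.jl`):

```julia
using Bessels.SIMDMath

# Evaluate four Chebyshev series of equal length at one point with a single
# SIMD Clenshaw recursion; pack_horner lays one series in each vector lane.
P = ntuple(_ -> ntuple(n -> rand()*(-1)^n / n, 10), 4)
x = 0.5
a = clenshaw_simd(x, pack_horner(P))
a[1].value ≈ clen(x, P[1])   # each lane matches its scalar evaluation
```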
From 288a7b3281e9a988b189fccf408b7300f38b95af Mon Sep 17 00:00:00 2001
From: Michael Helton
Date: Fri, 17 Mar 2023 14:30:08 -0400
Subject: [PATCH 12/14] reformat

---
 src/AiryFunctions/AiryFunctions.jl   | 21 +++++++++++++++++++++
 src/{Airy => AiryFunctions}/airy.jl  |  0
 src/AiryFunctions/airy_polys.jl      |  0
 src/{Airy => AiryFunctions}/cairy.jl |  0
 src/Bessels.jl                       | 13 +++++++------
 src/{misc.jl => Math/Math.jl}        | 10 ++++++++++
 src/{ => Math}/math_constants.jl     |  0
 test/runtests.jl                     | 16 ++++++++--------
 8 files changed, 46 insertions(+), 14 deletions(-)
 create mode 100644 src/AiryFunctions/AiryFunctions.jl
 rename src/{Airy => AiryFunctions}/airy.jl (100%)
 create mode 100644 src/AiryFunctions/airy_polys.jl
 rename src/{Airy => AiryFunctions}/cairy.jl (100%)
 rename src/{misc.jl => Math/Math.jl} (92%)
 rename src/{ => Math}/math_constants.jl (100%)

diff --git a/src/AiryFunctions/AiryFunctions.jl b/src/AiryFunctions/AiryFunctions.jl
new file mode 100644
index 0000000..dfd209d
--- /dev/null
+++ b/src/AiryFunctions/AiryFunctions.jl
@@ -0,0 +1,21 @@
+module AiryFunctions
+
+using ..SIMDMath
+using ..Bessels: ComplexOrReal
+using ..Math: GAMMA_TWO_THIRDS, GAMMA_ONE_THIRD, GAMMA_ONE_SIXTH, GAMMA_FIVE_SIXTHS
+using ..Math: PIPOW3O2, ONEOSQPI
+
+export airyai
+export airyaix
+export airyaiprime
+export airyaiprimex
+export airybi
+export airybix
+export airybiprime
+export airybiprimex
+
+include("airy.jl")
+include("cairy.jl")
+include("airy_polys.jl")
+
+end

diff --git a/src/Airy/airy.jl b/src/AiryFunctions/airy.jl
similarity index 100%
rename from src/Airy/airy.jl
rename to src/AiryFunctions/airy.jl

diff --git a/src/AiryFunctions/airy_polys.jl b/src/AiryFunctions/airy_polys.jl
new file mode 100644
index 0000000..e69de29

diff --git a/src/Airy/cairy.jl b/src/AiryFunctions/cairy.jl
similarity index 100%
rename from src/Airy/cairy.jl
rename to src/AiryFunctions/cairy.jl

diff --git a/src/Bessels.jl b/src/Bessels.jl
index 84d42d2..4ff41b2 100644
--- a/src/Bessels.jl
+++ b/src/Bessels.jl
@@ -1,5 +1,7 @@
 module Bessels
 
+using .AiryFunctions
+
 export besselj0
 export besselj1
 export besselj
@@ -45,11 +47,14 @@ include("besselj.jl")
 include("besselk.jl")
 include("bessely.jl")
 include("hankel.jl")
-include("Airy/airy.jl")
-include("Airy/cairy.jl")
 include("sphericalbessel.jl")
 include("modifiedsphericalbessel.jl")
 
+include("SIMDMath/SIMDMath.jl")
+include("Math/Math.jl")
+
+include("AiryFunctions/AiryFunctions.jl")
+
 include("Float128/besseli.jl")
 include("Float128/besselj.jl")
 include("Float128/besselk.jl")
@@ -57,16 +62,12 @@ include("Float128/bessely.jl")
 include("Float128/constants.jl")
 
 include("constants.jl")
-include("math_constants.jl")
 include("U_polynomials.jl")
 include("recurrence.jl")
-include("misc.jl")
 include("Polynomials/besselj_polys.jl")
 include("asymptotics.jl")
 include("gamma.jl")
 
-include("SIMDMath/SIMDMath.jl")
-
 precompile(besselj, (Float64, Float64))
 
 end

diff --git a/src/misc.jl b/src/Math/Math.jl
similarity index 92%
rename from src/misc.jl
rename to src/Math/Math.jl
index 6be8c72..3ad80da 100644
--- a/src/misc.jl
+++ b/src/Math/Math.jl
@@ -1,3 +1,9 @@
+module Math
+
+# contains miscellaneous math functions and math constants
+
+include("math_constants.jl")
+
 # function to more accurately compute cos(x + xn)
 # see https://github.com/heltonmc/Bessels.jl/pull/13
 # written by @oscardssmith
@@ -18,6 +24,7 @@ function cos_sum(x, xn)
         return Base.Math.sin_kernel(y)
     end
 end
+
 # function to more accurately compute sin(x + xn)
 function sin_sum(x, xn)
     s = x + xn
@@ -36,6 +43,7 @@ function sin_sum(x, xn)
         return -Base.Math.cos_kernel(y)
     end
 end
+
 @inline function reduce_pi02_med(x::Float64)
     pio2_1 = 1.57079632673412561417e+00
 
@@ -45,3 +53,5 @@ end
     y = r-w
     return unsafe_trunc(Int, fn), Base.Math.DoubleFloat64(y, (r-y)-w)
 end
+
+end
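The relocated `cos_sum`/`sin_sum` helpers evaluate `cos(x + xn)` and `sin(x + xn)` where `xn` is a small correction term that would otherwise be partially rounded away when forming `x + xn` directly. A quick accuracy sketch against `BigFloat` (an illustrative check only, assuming the `Bessels.Math` layout introduced in this patch):

```julia
using Bessels

# xn is a tiny residual, e.g. left over from an argument reduction; cos_sum
# carries it through instead of folding it into x up front.
x, xn = 1.5, 1e-18
Bessels.Math.cos_sum(x, xn) ≈ Float64(cos(big(x) + big(xn)))
Bessels.Math.sin_sum(x, xn) ≈ Float64(sin(big(x) + big(xn)))
```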
diff --git a/src/math_constants.jl b/src/Math/math_constants.jl
similarity index 100%
rename from src/math_constants.jl
rename to src/Math/math_constants.jl

diff --git a/test/runtests.jl b/test/runtests.jl
index 8452f2b..6d713e0 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -2,12 +2,12 @@ using Bessels
 using Test
 import SpecialFunctions
 
-@time @testset "besseli" begin include("besseli_test.jl") end
-@time @testset "besselk" begin include("besselk_test.jl") end
-@time @testset "besselj" begin include("besselj_test.jl") end
-@time @testset "bessely" begin include("bessely_test.jl") end
-@time @testset "hankel" begin include("hankel_test.jl") end
-@time @testset "gamma" begin include("gamma_test.jl") end
+#@time @testset "besseli" begin include("besseli_test.jl") end
+#@time @testset "besselk" begin include("besselk_test.jl") end
+#@time @testset "besselj" begin include("besselj_test.jl") end
+#@time @testset "bessely" begin include("bessely_test.jl") end
+#@time @testset "hankel" begin include("hankel_test.jl") end
+#@time @testset "gamma" begin include("gamma_test.jl") end
 @time @testset "airy" begin
include("airy_test.jl") end -@time @testset "sphericalbessel" begin include("sphericalbessel_test.jl") end -@time @testset "SIMDMath" begin include("SIMDMath_test.jl") end +#@time @testset "sphericalbessel" begin include("sphericalbessel_test.jl") end +#@time @testset "SIMDMath" begin include("SIMDMath_test.jl") end From 733a50d6c5b6d0bb318042692d0cea424dbd135c Mon Sep 17 00:00:00 2001 From: Michael Helton Date: Fri, 17 Mar 2023 15:47:38 -0400 Subject: [PATCH 13/14] refactor into modules --- src/AiryFunctions/AiryFunctions.jl | 5 +- src/AiryFunctions/cairy.jl | 2 + src/BesselFunctions/BesselFunctions.jl | 61 +++++++ src/{ => BesselFunctions}/Float128/besselj.jl | 0 src/{ => BesselFunctions}/Float128/bessely.jl | 0 .../Float128/constants.jl | 0 .../Polynomials/besselj_polys.jl | 0 src/{ => BesselFunctions}/U_polynomials.jl | 0 src/{ => BesselFunctions}/asymptotics.jl | 0 src/{ => BesselFunctions}/besseli.jl | 0 src/{ => BesselFunctions}/besselj.jl | 0 src/{ => BesselFunctions}/besselk.jl | 0 src/{ => BesselFunctions}/bessely.jl | 11 -- src/{ => BesselFunctions}/constants.jl | 0 src/{ => BesselFunctions}/hankel.jl | 0 .../modifiedsphericalbessel.jl | 0 src/{ => BesselFunctions}/recurrence.jl | 0 src/{ => BesselFunctions}/sphericalbessel.jl | 0 src/Bessels.jl | 32 +--- src/Float128/besseli.jl | 0 src/Float128/besselk.jl | 0 src/GammaFunctions/GammaFunctions.jl | 9 + src/{ => GammaFunctions}/gamma.jl | 0 src/Math/Math.jl | 14 ++ src/Math/math_constants.jl | 6 + test/besselj_test.jl | 20 +-- test/bessely_test.jl | 4 +- test/gamma_test.jl | 14 +- test/runtests.jl | 16 +- test/sphericalbessel_test.jl | 160 +++++++++--------- 30 files changed, 210 insertions(+), 144 deletions(-) create mode 100644 src/BesselFunctions/BesselFunctions.jl rename src/{ => BesselFunctions}/Float128/besselj.jl (100%) rename src/{ => BesselFunctions}/Float128/bessely.jl (100%) rename src/{ => BesselFunctions}/Float128/constants.jl (100%) rename src/{ => BesselFunctions}/Polynomials/besselj_polys.jl (100%) rename src/{ => BesselFunctions}/U_polynomials.jl (100%) rename src/{ => BesselFunctions}/asymptotics.jl (100%) rename src/{ => BesselFunctions}/besseli.jl (100%) rename src/{ => BesselFunctions}/besselj.jl (100%) rename src/{ => BesselFunctions}/besselk.jl (100%) rename src/{ => BesselFunctions}/bessely.jl (99%) rename src/{ => BesselFunctions}/constants.jl (100%) rename src/{ => BesselFunctions}/hankel.jl (100%) rename src/{ => BesselFunctions}/modifiedsphericalbessel.jl (100%) rename src/{ => BesselFunctions}/recurrence.jl (100%) rename src/{ => BesselFunctions}/sphericalbessel.jl (100%) delete mode 100644 src/Float128/besseli.jl delete mode 100644 src/Float128/besselk.jl create mode 100644 src/GammaFunctions/GammaFunctions.jl rename src/{ => GammaFunctions}/gamma.jl (100%) diff --git a/src/AiryFunctions/AiryFunctions.jl b/src/AiryFunctions/AiryFunctions.jl index dfd209d..b5fd1dd 100644 --- a/src/AiryFunctions/AiryFunctions.jl +++ b/src/AiryFunctions/AiryFunctions.jl @@ -1,9 +1,8 @@ module AiryFunctions -using ..SIMDMath +using ..GammaFunctions using ..Bessels: ComplexOrReal -using ..Math: GAMMA_TWO_THIRDS, GAMMA_ONE_THIRD, GAMMA_ONE_SIXTH, GAMMA_FIVE_SIXTHS -using ..Math: PIPOW3O2, ONEOSQPI +using ..Math export airyai export airyaix diff --git a/src/AiryFunctions/cairy.jl b/src/AiryFunctions/cairy.jl index 6dff2af..d50692b 100644 --- a/src/AiryFunctions/cairy.jl +++ b/src/AiryFunctions/cairy.jl @@ -21,6 +21,8 @@ # [1] Jentschura, Ulrich David, and E. Lötstedt. 
"Numerical calculation of Bessel, Hankel and Airy functions." # Computer Physics Communications 183.3 (2012): 506-519. +using ..BesselFunctions: besseli_power_series_inu_inum1, besselk_down_recurrence, besselk_ratio_knu_knup1 + """ airyai(z) diff --git a/src/BesselFunctions/BesselFunctions.jl b/src/BesselFunctions/BesselFunctions.jl new file mode 100644 index 0000000..c85e4fe --- /dev/null +++ b/src/BesselFunctions/BesselFunctions.jl @@ -0,0 +1,61 @@ +module BesselFunctions + +using ..Bessels: ComplexOrReal +using ..GammaFunctions +using ..Math + +export besselj0 +export besselj1 +export besselj +export besselj! + +export bessely0 +export bessely1 +export bessely +export bessely! + +export sphericalbesselj +export sphericalbessely +export sphericalbesseli +export sphericalbesselk + +export besseli +export besselix +export besseli0 +export besseli0x +export besseli1 +export besseli1x +export besseli! + +export besselk +export besselkx +export besselk0 +export besselk0x +export besselk1 +export besselk1x +export besselk! + +export besselh +export hankelh1 +export hankelh2 + +include("besseli.jl") +include("besselj.jl") +include("besselk.jl") +include("bessely.jl") +include("hankel.jl") +include("sphericalbessel.jl") +include("modifiedsphericalbessel.jl") + + +include("constants.jl") +include("U_polynomials.jl") +include("recurrence.jl") +include("Polynomials/besselj_polys.jl") +include("asymptotics.jl") + +include("Float128/constants.jl") +include("Float128/besselj.jl") +include("Float128/bessely.jl") + +end diff --git a/src/Float128/besselj.jl b/src/BesselFunctions/Float128/besselj.jl similarity index 100% rename from src/Float128/besselj.jl rename to src/BesselFunctions/Float128/besselj.jl diff --git a/src/Float128/bessely.jl b/src/BesselFunctions/Float128/bessely.jl similarity index 100% rename from src/Float128/bessely.jl rename to src/BesselFunctions/Float128/bessely.jl diff --git a/src/Float128/constants.jl b/src/BesselFunctions/Float128/constants.jl similarity index 100% rename from src/Float128/constants.jl rename to src/BesselFunctions/Float128/constants.jl diff --git a/src/Polynomials/besselj_polys.jl b/src/BesselFunctions/Polynomials/besselj_polys.jl similarity index 100% rename from src/Polynomials/besselj_polys.jl rename to src/BesselFunctions/Polynomials/besselj_polys.jl diff --git a/src/U_polynomials.jl b/src/BesselFunctions/U_polynomials.jl similarity index 100% rename from src/U_polynomials.jl rename to src/BesselFunctions/U_polynomials.jl diff --git a/src/asymptotics.jl b/src/BesselFunctions/asymptotics.jl similarity index 100% rename from src/asymptotics.jl rename to src/BesselFunctions/asymptotics.jl diff --git a/src/besseli.jl b/src/BesselFunctions/besseli.jl similarity index 100% rename from src/besseli.jl rename to src/BesselFunctions/besseli.jl diff --git a/src/besselj.jl b/src/BesselFunctions/besselj.jl similarity index 100% rename from src/besselj.jl rename to src/BesselFunctions/besselj.jl diff --git a/src/besselk.jl b/src/BesselFunctions/besselk.jl similarity index 100% rename from src/besselk.jl rename to src/BesselFunctions/besselk.jl diff --git a/src/bessely.jl b/src/BesselFunctions/bessely.jl similarity index 99% rename from src/bessely.jl rename to src/BesselFunctions/bessely.jl index fd50753..4671f51 100644 --- a/src/bessely.jl +++ b/src/BesselFunctions/bessely.jl @@ -508,17 +508,6 @@ function bessely_chebyshev_low_orders(v, x) return clenshaw_chebyshev(v1, a), clenshaw_chebyshev(v2, a) end -# uses the Clenshaw algorithm to recursively evaluate a linear 
combination of Chebyshev polynomials -function clenshaw_chebyshev(x, c) - x2 = 2x - c0 = c[end-1] - c1 = c[end] - for i in length(c)-2:-1:1 - c0, c1 = c[i] - c1, c0 + c1 * x2 - end - return c0 + c1 * x -end - # to generate the Chebyshev weights #= using ArbNumerics, FastChebInterp diff --git a/src/constants.jl b/src/BesselFunctions/constants.jl similarity index 100% rename from src/constants.jl rename to src/BesselFunctions/constants.jl diff --git a/src/hankel.jl b/src/BesselFunctions/hankel.jl similarity index 100% rename from src/hankel.jl rename to src/BesselFunctions/hankel.jl diff --git a/src/modifiedsphericalbessel.jl b/src/BesselFunctions/modifiedsphericalbessel.jl similarity index 100% rename from src/modifiedsphericalbessel.jl rename to src/BesselFunctions/modifiedsphericalbessel.jl diff --git a/src/recurrence.jl b/src/BesselFunctions/recurrence.jl similarity index 100% rename from src/recurrence.jl rename to src/BesselFunctions/recurrence.jl diff --git a/src/sphericalbessel.jl b/src/BesselFunctions/sphericalbessel.jl similarity index 100% rename from src/sphericalbessel.jl rename to src/BesselFunctions/sphericalbessel.jl diff --git a/src/Bessels.jl b/src/Bessels.jl index 4ff41b2..3fb07c6 100644 --- a/src/Bessels.jl +++ b/src/Bessels.jl @@ -1,7 +1,5 @@ module Bessels -using .AiryFunctions - export besselj0 export besselj1 export besselj @@ -12,6 +10,8 @@ export bessely export sphericalbesselj export sphericalbessely +export sphericalbesseli +export sphericalbesselk export besseli export besselix @@ -40,33 +40,19 @@ export airybix export airybiprime export airybiprimex -const ComplexOrReal{T} = Union{T,Complex{T}} +export gamma -include("besseli.jl") -include("besselj.jl") -include("besselk.jl") -include("bessely.jl") -include("hankel.jl") -include("sphericalbessel.jl") -include("modifiedsphericalbessel.jl") +const ComplexOrReal{T} = Union{T,Complex{T}} include("SIMDMath/SIMDMath.jl") include("Math/Math.jl") - +include("GammaFunctions/GammaFunctions.jl") +include("BesselFunctions/BesselFunctions.jl") include("AiryFunctions/AiryFunctions.jl") -include("Float128/besseli.jl") -include("Float128/besselj.jl") -include("Float128/besselk.jl") -include("Float128/bessely.jl") -include("Float128/constants.jl") - -include("constants.jl") -include("U_polynomials.jl") -include("recurrence.jl") -include("Polynomials/besselj_polys.jl") -include("asymptotics.jl") -include("gamma.jl") +using .GammaFunctions +using .BesselFunctions +using .AiryFunctions precompile(besselj, (Float64, Float64)) diff --git a/src/Float128/besseli.jl b/src/Float128/besseli.jl deleted file mode 100644 index e69de29..0000000 diff --git a/src/Float128/besselk.jl b/src/Float128/besselk.jl deleted file mode 100644 index e69de29..0000000 diff --git a/src/GammaFunctions/GammaFunctions.jl b/src/GammaFunctions/GammaFunctions.jl new file mode 100644 index 0000000..39a0399 --- /dev/null +++ b/src/GammaFunctions/GammaFunctions.jl @@ -0,0 +1,9 @@ +module GammaFunctions + +using ..Math: SQ2PI + +export gamma + +include("gamma.jl") + +end diff --git a/src/gamma.jl b/src/GammaFunctions/gamma.jl similarity index 100% rename from src/gamma.jl rename to src/GammaFunctions/gamma.jl diff --git a/src/Math/Math.jl b/src/Math/Math.jl index 3ad80da..21ae7f8 100644 --- a/src/Math/Math.jl +++ b/src/Math/Math.jl @@ -2,6 +2,9 @@ module Math # constains miscelaneous math functions and math constants +export cos_sum, sin_sum +export clenshaw_chebyshev + include("math_constants.jl") # function to more accurately compute cos(x + xn) @@ -54,4 +57,15 @@ 
end return unsafe_trunc(Int, fn), Base.Math.DoubleFloat64(y, (r-y)-w) end +# uses the Clenshaw algorithm to recursively evaluate a linear combination of Chebyshev polynomials +function clenshaw_chebyshev(x, c) + x2 = 2x + c0 = c[end-1] + c1 = c[end] + for i in length(c)-2:-1:1 + c0, c1 = c[i] - c1, c0 + c1 * x2 + end + return c0 + c1 * x +end + end diff --git a/src/Math/math_constants.jl b/src/Math/math_constants.jl index 1dca3d3..f25b520 100644 --- a/src/Math/math_constants.jl +++ b/src/Math/math_constants.jl @@ -1,3 +1,9 @@ +export ONEOSQPI, TWOOPI, PIO2, SQPIO2, SQ1O2PI, SQ2OPI +export THPIO4, SQ2PI, PIPOW3O2, PIO4, SQ2O2 + +export GAMMA_TWO_THIRDS, GAMMA_ONE_THIRD, GAMMA_ONE_SIXTH, GAMMA_FIVE_SIXTHS + + const ONEOSQPI(::Type{BigFloat}) = big"5.6418958354775628694807945156077258584405E-1" const TWOOPI(::Type{BigFloat}) = big"6.3661977236758134307553505349005744813784E-1" const PIO2(::Type{BigFloat}) = big"1.570796326794896619231321691639751442098584699687552910487472296153908203143099" diff --git a/test/besselj_test.jl b/test/besselj_test.jl index 523010a..0cce37f 100644 --- a/test/besselj_test.jl +++ b/test/besselj_test.jl @@ -102,8 +102,8 @@ for v in nu, xx in x xx *= v sf = SpecialFunctions.besselj(BigFloat(v), BigFloat(xx)) @test isapprox(besselj(v, xx), Float64(sf), rtol=5e-14) - @test isapprox(Bessels.besseljy_positive_args(v, xx)[1], Float64(sf), rtol=5e-14) - @test isapprox(Bessels.besselj(Float32(v), Float32(xx)), Float32(sf)) + @test isapprox(Bessels.BesselFunctions.besseljy_positive_args(v, xx)[1], Float64(sf), rtol=5e-14) + @test isapprox(besselj(Float32(v), Float32(xx)), Float32(sf)) end # test half orders (SpecialFunctions does not give big float precision) @@ -115,8 +115,8 @@ for v in nu, xx in x xx *= v sf = SpecialFunctions.besselj(v, xx) @test isapprox(besselj(v, xx), sf, rtol=1e-12) - @test isapprox(Bessels.besseljy_positive_args(v, xx)[1], sf, rtol=1e-12) - @test isapprox(Bessels.besselj(Float32(v), Float32(xx)), Float32(sf)) + @test isapprox(Bessels.BesselFunctions.besseljy_positive_args(v, xx)[1], sf, rtol=1e-12) + @test isapprox(besselj(Float32(v), Float32(xx)), Float32(sf)) end ## test large orders @@ -126,7 +126,7 @@ for v in nu, xx in x xx *= v sf = SpecialFunctions.besselj(v, xx) @test isapprox(besselj(v, xx), sf, rtol=5e-11) - @test isapprox(Bessels.besseljy_positive_args(v, xx)[1], Float64(sf), rtol=5e-11) + @test isapprox(Bessels.BesselFunctions.besseljy_positive_args(v, xx)[1], Float64(sf), rtol=5e-11) end # test nu_range @@ -143,13 +143,13 @@ end ## test large arguments @test isapprox(besselj(10.0, 150.0), SpecialFunctions.besselj(10.0, 150.0), rtol=1e-12) -@test isapprox(Bessels.besseljy_large_argument(15.0, 100.0)[1], SpecialFunctions.besselj(15.0, 100.0), rtol=1e-12) -@test isapprox(Bessels.besseljy_large_argument(15.0, 45.0)[1], SpecialFunctions.besselj(15.0, 45.0), rtol=1e-12) -@test isapprox(Bessels.besseljy_large_argument(15.0, 25.5)[1], SpecialFunctions.besselj(15.0, 25.5), rtol=1e-12) +@test isapprox(Bessels.BesselFunctions.besseljy_large_argument(15.0, 100.0)[1], SpecialFunctions.besselj(15.0, 100.0), rtol=1e-12) +@test isapprox(Bessels.BesselFunctions.besseljy_large_argument(15.0, 45.0)[1], SpecialFunctions.besselj(15.0, 45.0), rtol=1e-12) +@test isapprox(Bessels.BesselFunctions.besseljy_large_argument(15.0, 25.5)[1], SpecialFunctions.besselj(15.0, 25.5), rtol=1e-12) # test BigFloat for single point -@test isapprox(Bessels.besseljy_debye(big"2000", big"1500.0")[1], SpecialFunctions.besselj(big"2000", big"1500"), rtol=5e-20) -@test 
isapprox(Bessels.besseljy_large_argument(big"20", big"1500.0")[1], SpecialFunctions.besselj(big"20", big"1500"), rtol=5e-20) +@test isapprox(Bessels.BesselFunctions.besseljy_debye(big"2000", big"1500.0")[1], SpecialFunctions.besselj(big"2000", big"1500"), rtol=5e-20) +@test isapprox(Bessels.BesselFunctions.besseljy_large_argument(big"20", big"1500.0")[1], SpecialFunctions.besselj(big"20", big"1500"), rtol=5e-20) # need to test accuracy of negative orders and negative arguments and all combinations within # SpecialFunctions.jl doesn't provide these so will hand check against hard values diff --git a/test/bessely_test.jl b/test/bessely_test.jl index 618bbc6..90ae7ff 100644 --- a/test/bessely_test.jl +++ b/test/bessely_test.jl @@ -82,7 +82,7 @@ for v in nu, xx in x xx *= v sf = SpecialFunctions.bessely(BigFloat(v), BigFloat(xx)) @test isapprox(bessely(v, xx), Float64(sf), rtol=2e-13) - @test isapprox(Bessels.besseljy_positive_args(v, xx)[2], Float64(sf), rtol=5e-12) + @test isapprox(Bessels.BesselFunctions.besseljy_positive_args(v, xx)[2], Float64(sf), rtol=5e-12) @test isapprox(bessely(Float32(v), Float32(xx)), Float32(sf)) end @@ -95,7 +95,7 @@ for v in nu, xx in x xx *= v sf = SpecialFunctions.bessely(v, xx) @test isapprox(bessely(v, xx), sf, rtol=5e-12) - @test isapprox(Bessels.besseljy_positive_args(v, xx)[2], SpecialFunctions.bessely(v, xx), rtol=5e-12) + @test isapprox(Bessels.BesselFunctions.besseljy_positive_args(v, xx)[2], SpecialFunctions.bessely(v, xx), rtol=5e-12) @test isapprox(bessely(Float32(v), Float32(xx)), Float32(sf)) end diff --git a/test/gamma_test.jl b/test/gamma_test.jl index d7ff6bf..5f8e8f7 100644 --- a/test/gamma_test.jl +++ b/test/gamma_test.jl @@ -1,16 +1,16 @@ for (T, max, rtol) in ((Float16, 13, 1.0), (Float32, 43, 1.0), (Float64, 170, 7)) - v = rand(T, 10000)*max + local v = rand(T, 10000)*max for x in v - @test isapprox(T(SpecialFunctions.gamma(widen(x))), Bessels.gamma(x), rtol=rtol*eps(T)) + @test isapprox(T(SpecialFunctions.gamma(widen(x))), gamma(x), rtol=rtol*eps(T)) if isinteger(x) && x != 0 - @test_throws DomainError Bessels.gamma(-x) + @test_throws DomainError gamma(-x) else - @test isapprox(T(SpecialFunctions.gamma(widen(-x))), Bessels.gamma(-x), atol=nextfloat(T(0.),2), rtol=rtol*eps(T)) + @test isapprox(T(SpecialFunctions.gamma(widen(-x))), gamma(-x), atol=nextfloat(T(0.),2), rtol=rtol*eps(T)) end end - @test isnan(Bessels.gamma(T(NaN))) - @test isinf(Bessels.gamma(T(Inf))) + @test isnan(gamma(T(NaN))) + @test isinf(gamma(T(Inf))) end x = [0, 1, 2, 3, 8, 15, 20, 30] -@test SpecialFunctions.gamma.(x) ≈ Bessels.gamma.(x) +@test SpecialFunctions.gamma.(x) ≈ gamma.(x) diff --git a/test/runtests.jl b/test/runtests.jl index 6d713e0..8452f2b 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -2,12 +2,12 @@ using Bessels using Test import SpecialFunctions -#@time @testset "besseli" begin include("besseli_test.jl") end -#@time @testset "besselk" begin include("besselk_test.jl") end -#@time @testset "besselj" begin include("besselj_test.jl") end -#@time @testset "bessely" begin include("bessely_test.jl") end -#@time @testset "hankel" begin include("hankel_test.jl") end -#@time @testset "gamma" begin include("gamma_test.jl") end +@time @testset "besseli" begin include("besseli_test.jl") end +@time @testset "besselk" begin include("besselk_test.jl") end +@time @testset "besselj" begin include("besselj_test.jl") end +@time @testset "bessely" begin include("bessely_test.jl") end +@time @testset "hankel" begin include("hankel_test.jl") end +@time @testset 
"gamma" begin include("gamma_test.jl") end @time @testset "airy" begin include("airy_test.jl") end -#@time @testset "sphericalbessel" begin include("sphericalbessel_test.jl") end -#@time @testset "SIMDMath" begin include("SIMDMath_test.jl") end +@time @testset "sphericalbessel" begin include("sphericalbessel_test.jl") end +@time @testset "SIMDMath" begin include("SIMDMath_test.jl") end diff --git a/test/sphericalbessel_test.jl b/test/sphericalbessel_test.jl index 1461ac7..7834b00 100644 --- a/test/sphericalbessel_test.jl +++ b/test/sphericalbessel_test.jl @@ -1,120 +1,120 @@ # test very small inputs x = 1e-15 -@test @inferred(Bessels.sphericalbesselj(0, x)) ≈ SpecialFunctions.sphericalbesselj(0, x) -@test @inferred(Bessels.sphericalbesselj(1, x)) ≈ SpecialFunctions.sphericalbesselj(1, x) -@test @inferred(Bessels.sphericalbesselj(5.5, x)) ≈ SpecialFunctions.sphericalbesselj(5.5, x) -@test @inferred(Bessels.sphericalbesselj(10, x)) ≈ SpecialFunctions.sphericalbesselj(10, x) -@test @inferred(Bessels.sphericalbessely(0, x)) ≈ SpecialFunctions.sphericalbessely(0, x) -@test @inferred(Bessels.sphericalbessely(1, x)) ≈ SpecialFunctions.sphericalbessely(1, x) -@test @inferred(Bessels.sphericalbessely(5.5, x)) ≈ SpecialFunctions.sphericalbessely(5.5, x) -@test @inferred(Bessels.sphericalbessely(10, x)) ≈ SpecialFunctions.sphericalbessely(10, x) -@test @inferred(Bessels.sphericalbesselk(5.5, x)) ≈ SpecialFunctions.besselk(5.5 + 1/2, x) * sqrt( 2 / (x*pi)) -@test @inferred(Bessels.sphericalbesselk(10, x)) ≈ SpecialFunctions.besselk(10 + 1/2, x) * sqrt( 2 / (x*pi)) +@test @inferred(sphericalbesselj(0, x)) ≈ SpecialFunctions.sphericalbesselj(0, x) +@test @inferred(sphericalbesselj(1, x)) ≈ SpecialFunctions.sphericalbesselj(1, x) +@test @inferred(sphericalbesselj(5.5, x)) ≈ SpecialFunctions.sphericalbesselj(5.5, x) +@test @inferred(sphericalbesselj(10, x)) ≈ SpecialFunctions.sphericalbesselj(10, x) +@test @inferred(sphericalbessely(0, x)) ≈ SpecialFunctions.sphericalbessely(0, x) +@test @inferred(sphericalbessely(1, x)) ≈ SpecialFunctions.sphericalbessely(1, x) +@test @inferred(sphericalbessely(5.5, x)) ≈ SpecialFunctions.sphericalbessely(5.5, x) +@test @inferred(sphericalbessely(10, x)) ≈ SpecialFunctions.sphericalbessely(10, x) +@test @inferred(sphericalbesselk(5.5, x)) ≈ SpecialFunctions.besselk(5.5 + 1/2, x) * sqrt( 2 / (x*pi)) +@test @inferred(sphericalbesselk(10, x)) ≈ SpecialFunctions.besselk(10 + 1/2, x) * sqrt( 2 / (x*pi)) x = 1e-20 -@test @inferred(Bessels.sphericalbesseli(0, x)) ≈ SpecialFunctions.besseli(0 + 1/2, x) * sqrt( pi / (x*2)) -@test @inferred(Bessels.sphericalbesseli(1, x)) ≈ SpecialFunctions.besseli(1 + 1/2, x) * sqrt( pi / (x*2)) -@test @inferred(Bessels.sphericalbesseli(2, x)) ≈ SpecialFunctions.besseli(2 + 1/2, x) * sqrt( pi / (x*2)) -@test @inferred(Bessels.sphericalbesseli(3, x)) ≈ SpecialFunctions.besseli(3 + 1/2, x) * sqrt( pi / (x*2)) -@test @inferred(Bessels.sphericalbesseli(4, x)) ≈ SpecialFunctions.besseli(4 + 1/2, x) * sqrt( pi / (x*2)) -@test @inferred(Bessels.sphericalbesseli(6.5, x)) ≈ SpecialFunctions.besseli(6.5 + 1/2, x) * sqrt( pi / (x*2)) +@test @inferred(sphericalbesseli(0, x)) ≈ SpecialFunctions.besseli(0 + 1/2, x) * sqrt( pi / (x*2)) +@test @inferred(sphericalbesseli(1, x)) ≈ SpecialFunctions.besseli(1 + 1/2, x) * sqrt( pi / (x*2)) +@test @inferred(sphericalbesseli(2, x)) ≈ SpecialFunctions.besseli(2 + 1/2, x) * sqrt( pi / (x*2)) +@test @inferred(sphericalbesseli(3, x)) ≈ SpecialFunctions.besseli(3 + 1/2, x) * sqrt( pi / (x*2)) +@test 
@inferred(sphericalbesseli(4, x)) ≈ SpecialFunctions.besseli(4 + 1/2, x) * sqrt( pi / (x*2)) +@test @inferred(sphericalbesseli(6.5, x)) ≈ SpecialFunctions.besseli(6.5 + 1/2, x) * sqrt( pi / (x*2)) # test zero -@test isone(Bessels.sphericalbesselj(0, 0.0)) -@test iszero(Bessels.sphericalbesselj(3, 0.0)) -@test iszero(Bessels.sphericalbesselj(10.4, 0.0)) -@test iszero(Bessels.sphericalbesselj(100.6, 0.0)) +@test isone(sphericalbesselj(0, 0.0)) +@test iszero(sphericalbesselj(3, 0.0)) +@test iszero(sphericalbesselj(10.4, 0.0)) +@test iszero(sphericalbesselj(100.6, 0.0)) -@test Bessels.sphericalbessely(0, 0.0) == -Inf -@test Bessels.sphericalbessely(1.8, 0.0) == -Inf -@test Bessels.sphericalbessely(10, 0.0) == -Inf -@test Bessels.sphericalbessely(290, 0.0) == -Inf +@test sphericalbessely(0, 0.0) == -Inf +@test sphericalbessely(1.8, 0.0) == -Inf +@test sphericalbessely(10, 0.0) == -Inf +@test sphericalbessely(290, 0.0) == -Inf -@test isinf(Bessels.sphericalbesselk(0, 0.0)) -@test isinf(Bessels.sphericalbesselk(4, 0.0)) -@test isinf(Bessels.sphericalbesselk(10.2, 0.0)) +@test isinf(sphericalbesselk(0, 0.0)) +@test isinf(sphericalbesselk(4, 0.0)) +@test isinf(sphericalbesselk(10.2, 0.0)) x = 0.0 -@test isone(Bessels.sphericalbesseli(0, x)) -@test iszero(Bessels.sphericalbesseli(1, x)) -@test iszero(Bessels.sphericalbesseli(2, x)) -@test iszero(Bessels.sphericalbesseli(3, x)) -@test iszero(Bessels.sphericalbesseli(4, x)) -@test iszero(Bessels.sphericalbesseli(6.4, x)) +@test isone(sphericalbesseli(0, x)) +@test iszero(sphericalbesseli(1, x)) +@test iszero(sphericalbesseli(2, x)) +@test iszero(sphericalbesseli(3, x)) +@test iszero(sphericalbesseli(4, x)) +@test iszero(sphericalbesseli(6.4, x)) # test Inf -@test iszero(Bessels.sphericalbesselj(1, Inf)) -@test iszero(Bessels.sphericalbesselj(10.2, Inf)) -@test iszero(Bessels.sphericalbessely(3, Inf)) -@test iszero(Bessels.sphericalbessely(4.5, Inf)) +@test iszero(sphericalbesselj(1, Inf)) +@test iszero(sphericalbesselj(10.2, Inf)) +@test iszero(sphericalbessely(3, Inf)) +@test iszero(sphericalbessely(4.5, Inf)) -@test iszero(Bessels.sphericalbesselk(0, Inf)) -@test iszero(Bessels.sphericalbesselk(4, Inf)) -@test iszero(Bessels.sphericalbesselk(10.2, Inf)) +@test iszero(sphericalbesselk(0, Inf)) +@test iszero(sphericalbesselk(4, Inf)) +@test iszero(sphericalbesselk(10.2, Inf)) x = Inf -@test isinf(Bessels.sphericalbesseli(0, x)) -@test isinf(Bessels.sphericalbesseli(1, x)) -@test isinf(Bessels.sphericalbesseli(2, x)) -@test isinf(Bessels.sphericalbesseli(3, x)) -@test isinf(Bessels.sphericalbesseli(4, x)) -@test isinf(Bessels.sphericalbesseli(6.4, x)) +@test isinf(sphericalbesseli(0, x)) +@test isinf(sphericalbesseli(1, x)) +@test isinf(sphericalbesseli(2, x)) +@test isinf(sphericalbesseli(3, x)) +@test isinf(sphericalbesseli(4, x)) +@test isinf(sphericalbesseli(6.4, x)) # test NaN -@test isnan(Bessels.sphericalbesselj(1.4, NaN)) -@test isnan(Bessels.sphericalbesselj(4.0, NaN)) -@test isnan(Bessels.sphericalbessely(1.4, NaN)) -@test isnan(Bessels.sphericalbessely(4.0, NaN)) +@test isnan(sphericalbesselj(1.4, NaN)) +@test isnan(sphericalbesselj(4.0, NaN)) +@test isnan(sphericalbessely(1.4, NaN)) +@test isnan(sphericalbessely(4.0, NaN)) -@test isnan(Bessels.sphericalbesselk(1.4, NaN)) -@test isnan(Bessels.sphericalbesselk(4.0, NaN)) +@test isnan(sphericalbesselk(1.4, NaN)) +@test isnan(sphericalbesselk(4.0, NaN)) x = NaN -@test isnan(Bessels.sphericalbesseli(0, x)) -@test isnan(Bessels.sphericalbesseli(1, x)) -@test isnan(Bessels.sphericalbesseli(2, x)) 
-@test isnan(Bessels.sphericalbesseli(3, x)) -@test isnan(Bessels.sphericalbesseli(4, x)) -@test isnan(Bessels.sphericalbesseli(6.4, x)) +@test isnan(sphericalbesseli(0, x)) +@test isnan(sphericalbesseli(1, x)) +@test isnan(sphericalbesseli(2, x)) +@test isnan(sphericalbesseli(3, x)) +@test isnan(sphericalbesseli(4, x)) +@test isnan(sphericalbesseli(6.4, x)) # test Float16, Float32 types -@test @inferred(Bessels.sphericalbesselj(Float16(1.4), Float16(1.2))) isa Float16 -@test @inferred(Bessels.sphericalbessely(Float16(1.4), Float16(1.2))) isa Float16 -@test @inferred(Bessels.sphericalbesselj(1.4f0, 1.2f0)) isa Float32 -@test @inferred(Bessels.sphericalbessely(1.4f0, 1.2f0)) isa Float32 +@test @inferred(sphericalbesselj(Float16(1.4), Float16(1.2))) isa Float16 +@test @inferred(sphericalbessely(Float16(1.4), Float16(1.2))) isa Float16 +@test @inferred(sphericalbesselj(1.4f0, 1.2f0)) isa Float32 +@test @inferred(sphericalbessely(1.4f0, 1.2f0)) isa Float32 -@test Bessels.sphericalbesselk(Float16(1.4), Float16(1.2)) isa Float16 -@test Bessels.sphericalbesselk(1.0f0, 1.2f0) isa Float32 +@test sphericalbesselk(Float16(1.4), Float16(1.2)) isa Float16 +@test sphericalbesselk(1.0f0, 1.2f0) isa Float32 -@test Bessels.sphericalbesseli(Float16(1.4), Float16(1.2)) isa Float16 -@test Bessels.sphericalbesseli(1.0f0, 1.2f0) isa Float32 +@test sphericalbesseli(Float16(1.4), Float16(1.2)) isa Float16 +@test sphericalbesseli(1.0f0, 1.2f0) isa Float32 for x in 0.5:1.5:100.0, v in [0, 1, 2, 3, 4, 5.5, 8.2, 10] - @test isapprox(Bessels.sphericalbesselj(v, x), SpecialFunctions.sphericalbesselj(v, x), rtol=1e-12) - @test isapprox(Bessels.sphericalbessely(v, x), SpecialFunctions.sphericalbessely(v, x), rtol=1e-12) - @test isapprox(Bessels.sphericalbesselk(v, x), SpecialFunctions.besselk(v+1/2, x) * sqrt( 2 / (x*pi)), rtol=1e-12) - @test isapprox(Bessels.sphericalbesseli(v, x), SpecialFunctions.besseli(v+1/2, x) * sqrt( pi / (x*2)), rtol=1e-12) + @test isapprox(sphericalbesselj(v, x), SpecialFunctions.sphericalbesselj(v, x), rtol=1e-12) + @test isapprox(sphericalbessely(v, x), SpecialFunctions.sphericalbessely(v, x), rtol=1e-12) + @test isapprox(sphericalbesselk(v, x), SpecialFunctions.besselk(v+1/2, x) * sqrt( 2 / (x*pi)), rtol=1e-12) + @test isapprox(sphericalbesseli(v, x), SpecialFunctions.besseli(v+1/2, x) * sqrt( pi / (x*2)), rtol=1e-12) end for x in 5.5:4.0:160.0, v in [20, 25.0, 32.4, 40.0, 45.12, 50.0, 55.2, 60.124, 70.23, 75.0, 80.0, 92.3, 100.0, 120.0] - @test isapprox(Bessels.sphericalbesselj(v, x), SpecialFunctions.sphericalbesselj(v, x), rtol=3e-12) - @test isapprox(Bessels.sphericalbessely(v, x), SpecialFunctions.sphericalbessely(v, x), rtol=3e-12) - @test isapprox(Bessels.sphericalbesselk(v, x), SpecialFunctions.besselk(v+1/2, x) * sqrt( 2 / (x*pi)), rtol=1e-12) - @test isapprox(Bessels.sphericalbesseli(v, x), SpecialFunctions.besseli(v+1/2, x) * sqrt( pi / (x*2)), rtol=1e-12) + @test isapprox(sphericalbesselj(v, x), SpecialFunctions.sphericalbesselj(v, x), rtol=3e-12) + @test isapprox(sphericalbessely(v, x), SpecialFunctions.sphericalbessely(v, x), rtol=3e-12) + @test isapprox(sphericalbesselk(v, x), SpecialFunctions.besselk(v+1/2, x) * sqrt( 2 / (x*pi)), rtol=1e-12) + @test isapprox(sphericalbesseli(v, x), SpecialFunctions.besseli(v+1/2, x) * sqrt( pi / (x*2)), rtol=1e-12) end -@test isapprox(Bessels.sphericalbessely(270, 240.0), SpecialFunctions.sphericalbessely(270, 240.0), rtol=3e-12) +@test isapprox(sphericalbessely(270, 240.0), SpecialFunctions.sphericalbessely(270, 240.0), rtol=3e-12) v, x = 
-4.0, 5.6 -@test isapprox(Bessels.sphericalbesselj(v, x), 0.07774965105230025584537, rtol=3e-12) -@test isapprox(Bessels.sphericalbessely(v, x), -0.1833997131521190346258, rtol=3e-12) +@test isapprox(sphericalbesselj(v, x), 0.07774965105230025584537, rtol=3e-12) +@test isapprox(sphericalbessely(v, x), -0.1833997131521190346258, rtol=3e-12) v, x = -6.8, 15.6 -@test isapprox(Bessels.sphericalbesselj(v, x), 0.04386355397884301866595, rtol=3e-12) -@test isapprox(Bessels.sphericalbessely(v, x), 0.05061013363904335437354, rtol=3e-12) +@test isapprox(sphericalbesselj(v, x), 0.04386355397884301866595, rtol=3e-12) +@test isapprox(sphericalbessely(v, x), 0.05061013363904335437354, rtol=3e-12) # test for negative order of spherical modified besselk in the special integer # routine: for v in 1:10 - @test Bessels.sphericalbesselk(-v, 1.1) ≈ Bessels.sphericalbesselk(v-1, 1.1) + @test sphericalbesselk(-v, 1.1) ≈ sphericalbesselk(v-1, 1.1) end From 52001b513690f147e2a681a91e43aaca0ee6759a Mon Sep 17 00:00:00 2001 From: Michael Helton Date: Fri, 17 Mar 2023 17:17:22 -0400 Subject: [PATCH 14/14] remove SIMDmath --- src/Bessels.jl | 1 - src/SIMDMath/SIMDMath.jl | 34 ---------- src/SIMDMath/arithmetic.jl | 53 --------------- src/SIMDMath/horner.jl | 135 ------------------------------------- src/SIMDMath/readme.md | 126 ---------------------------------- test/SIMDMath_test.jl | 112 ------------------------------ test/runtests.jl | 1 - 7 files changed, 462 deletions(-) delete mode 100644 src/SIMDMath/SIMDMath.jl delete mode 100644 src/SIMDMath/arithmetic.jl delete mode 100644 src/SIMDMath/horner.jl delete mode 100644 src/SIMDMath/readme.md delete mode 100644 test/SIMDMath_test.jl diff --git a/src/Bessels.jl b/src/Bessels.jl index 3fb07c6..9208ab3 100644 --- a/src/Bessels.jl +++ b/src/Bessels.jl @@ -44,7 +44,6 @@ export gamma const ComplexOrReal{T} = Union{T,Complex{T}} -include("SIMDMath/SIMDMath.jl") include("Math/Math.jl") include("GammaFunctions/GammaFunctions.jl") include("BesselFunctions/BesselFunctions.jl") diff --git a/src/SIMDMath/SIMDMath.jl b/src/SIMDMath/SIMDMath.jl deleted file mode 100644 index 08efbc4..0000000 --- a/src/SIMDMath/SIMDMath.jl +++ /dev/null @@ -1,34 +0,0 @@ -# -# SIMDMath aims to provide vectorized basic math operations to be used in static fully unrolled functions such as computing special functions. -# -# The type system is heavily influenced by SIMD.jl (https://github.com/eschnett/SIMD.jl) licensed under the Simplified "2-clause" BSD License: -# Copyright (c) 2016-2020: Erik Schnetter, Kristoffer Carlsson, Julia Computing All rights reserved. 
-# -# This module is also influenced by VectorizationBase.jl (https://github.com/JuliaSIMD/VectorizationBase.jl) licensed under the MIT License: Copyright (c) 2018 Chris Elrod -# - -module SIMDMath - -using Base: llvmcall, VecElement - -export horner_simd, pack_horner -export horner, horner2, horner4, horner8 -export pack_horner2, pack_horner4, pack_horner8 - -export clenshaw_simd - -const VE = Base.VecElement -const FloatTypes = Union{Float16, Float32, Float64} -const LVec{N, FloatTypes} = NTuple{N, VE{FloatTypes}} -const ScalarTypes = Union{VE{FloatTypes}, FloatTypes} - -const LLVMType = Dict{DataType, String}( - Float16 => "half", - Float32 => "float", - Float64 => "double", -) - -include("arithmetic.jl") -include("horner.jl") - -end diff --git a/src/SIMDMath/arithmetic.jl b/src/SIMDMath/arithmetic.jl deleted file mode 100644 index 0612d01..0000000 --- a/src/SIMDMath/arithmetic.jl +++ /dev/null @@ -1,53 +0,0 @@ -import Base: muladd - -## ShuffleVector - -# create tuples of VecElements filled with a constant value of x -@inline constantvector(x::T, y) where T <: FloatTypes = constantvector(VE(x), y) - -@inline @generated function constantvector(x::VecElement{T}, y::Type{LVec{N, T}}) where {N, T <: FloatTypes} - s = """ - %2 = insertelement <$N x $(LLVMType[T])> undef, $(LLVMType[T]) %0, i32 0 - %3 = shufflevector <$N x $(LLVMType[T])> %2, <$N x $(LLVMType[T])> undef, <$N x i32> zeroinitializer - ret <$N x $(LLVMType[T])> %3 - """ - return :( - llvmcall($s, LVec{N, T}, Tuple{VecElement{T}}, x) - ) -end - -## muladd - -@inline muladd(x::LVec{N, T}, y::LVec{N, T}, z::LVec{N, T}) where {N, T <: FloatTypes} = _muladd(x, y, z) -@inline muladd(x::ScalarTypes, y::LVec{N, T}, z::LVec{N, T}) where {N, T <: FloatTypes} = muladd(constantvector(x, LVec{N, T}), y, z) -@inline muladd(x::LVec{N, T}, y::ScalarTypes, z::LVec{N, T}) where {N, T <: FloatTypes} = muladd(x, constantvector(y, LVec{N, T}), z) -@inline muladd(x::ScalarTypes, y::ScalarTypes, z::LVec{N, T}) where {N, T <: FloatTypes} = muladd(constantvector(x, LVec{N, T}), constantvector(y, LVec{N, T}), z) -@inline muladd(x::LVec{N, T}, y::LVec{N, T}, z::ScalarTypes) where {N, T <: FloatTypes} = muladd(x, y, constantvector(z, LVec{N, T})) -@inline muladd(x::ScalarTypes, y::LVec{N, T}, z::ScalarTypes) where {N, T <: FloatTypes} = muladd(constantvector(x, LVec{N, T}), y, constantvector(z, LVec{N, T})) -@inline muladd(x::LVec{N, T}, y::ScalarTypes, z::ScalarTypes) where {N, T <: FloatTypes} = muladd(x, constantvector(y, LVec{N, T}), constantvector(z, LVec{N, T})) - -# muladd llvm instructions - -@inline @generated function _muladd(x::LVec{N, T}, y::LVec{N, T}, z::LVec{N, T}) where {N, T <: FloatTypes} - s = """ - %4 = fmul contract <$N x $(LLVMType[T])> %0, %1 - %5 = fadd contract <$N x $(LLVMType[T])> %4, %2 - ret <$N x $(LLVMType[T])> %5 - """ - return :( - llvmcall($s, LVec{N, T}, Tuple{LVec{N, T}, LVec{N, T}, LVec{N, T}}, x, y, z) - ) -end - -for f in (:fadd, :fsub, :fmul, :fdiv) - @eval @inline @generated function $f(x::LVec{N, T}, y::LVec{N, T}) where {N, T <: FloatTypes} - ff = $(QuoteNode(f)) - s = """ - %3 = $ff <$N x $(LLVMType[T])> %0, %1 - ret <$N x $(LLVMType[T])> %3 - """ - return :( - llvmcall($s, LVec{N, T}, Tuple{LVec{N, T}, LVec{N, T}}, x, y) - ) - end -end diff --git a/src/SIMDMath/horner.jl b/src/SIMDMath/horner.jl deleted file mode 100644 index 646be55..0000000 --- a/src/SIMDMath/horner.jl +++ /dev/null @@ -1,135 +0,0 @@ -# -# Polynomial evaluation using Horner's scheme. 
-# -# File contains methods to compute several different polynomials at once using SIMD with Horner's scheme (horner_simd). -# Additional methods to evaluate a single polynomial using SIMD with different order Horner's scheme (horner2, horner4, horner8). -# TODO: Extend to Float32 -# TODO: Support complex numbers - -# -# Parallel Horner's scheme to evaluate 2, 4, or 8 polynomials in parallel using SIMD. -# -# Usage: -# x = 1.1 -# P1 = (1.1, 1.2, 1.4, 1.5) -# P2 = (1.3, 1.3, 1.6, 1.8) -# a = horner_simd(x, pack_horner(P1, P2)) -# a[1].value == evalpoly(x, P1) -# a[2].value == evalpoly(x, P2) -# -# Note the strict equality as this method doesn't alter the order of individual polynomial evaluations -# - -# cast single element `x` to a width of the number of polynomials -@inline horner_simd(x::Union{T, VE{T}}, p::NTuple{N, LVec{M, T}}) where {N, M, T} = horner_simd(constantvector(x, LVec{M, T}), p) - -@inline function horner_simd(x::LVec{M, T}, p::NTuple{N, LVec{M, T}}) where {N, M, T <: FloatTypes} - a = p[end] - @inbounds for i in N-1:-1:1 - a = muladd(x, a, p[i]) - end - return a -end - -@inline pack_horner(P::Tuple{Vararg{NTuple{M, T}, N}}) where {N, M, T} = ntuple(i -> LVec{N, T}((ntuple(j -> P[j][i], Val(N)))), Val(M)) - -# need to add multiply/add/subtract commands -# finish of clenshaw_simd algorithm -@inline function clenshaw_simd(x::T, c::NTuple{N, LVec{M, T}}) where {N, M, T <: FloatTypes} - x2 = constantvector(2*x, LVec{M, T}) - c0 = c[end-1] - c1 = c[end] - @inbounds for i in length(c)-2:-1:1 - c0, c1 = fsub(c[i], c1), muladd(x2, c1, c0) - end - return muladd(x, c1, c0) -end - -# -# Horner scheme for evaluating single polynomials -# -# In some cases we are interested in speeding up the evaluation of a single polynomial of high degree. -# It is possible to split the polynomial into pieces and use SIMD to convert each piece simultaneously -# -# Usage: -# x = 1.1 -# poly = (1.0, 0.5, 0.1, 0.05, 0.1) -# evalpoly(x, poly) ≈ horner2(x, pack_horner2(poly)) ≈ horner4(x, pack_horner4(poly)) ≈ horner8(x, pack_horner8(poly)) -# -# Note the approximative relation between each as floating point arithmetic is associative. -# Here, we are adding up the polynomial in different degrees so there will be a few ULPs difference -# - -# horner - default regular horner to base evalpoly -# horner2 - 2nd horner scheme that splits polynomial into even/odd powers -# horner4 - 4th order horner scheme that splits into a + ex^4 + ... & b + fx^5 ... & etc -# horner8 - 8th order horner scheme that splits into a + ix^8 + .... 
& etc - -@inline horner(x::T, P::NTuple{N, T}) where {N, T <: FloatTypes} = evalpoly(x, P) -@inline horner2(x, P::NTuple{N, T}) where {N, T <: FloatTypes} = horner2(x, pack_horner2(P)) -@inline horner4(x, P::NTuple{N, T}) where {N, T <: FloatTypes} = horner4(x, pack_horner4(P)) -@inline horner8(x, P::NTuple{N, T}) where {N, T <: FloatTypes} = horner8(x, pack_horner8(P)) - -@inline function horner2(x, P::NTuple{N, LVec{2, T}}) where {N, T <: FloatTypes} - a = horner_simd(x * x, P) - return muladd(x, a[2].value, a[1].value) -end - -@inline function horner4(x, P::NTuple{N, LVec{4, T}}) where {N, T <: FloatTypes} - xx = x * x - a = horner_simd(xx * xx, P) - b = muladd(x, LVec{2, T}((a[4].value, a[2].value)), LVec{2, T}((a[3].value, a[1].value))) - return muladd(xx, b[1].value, b[2].value) -end - -@inline function horner8(x, P::NTuple{N, LVec{8, T}}) where {N, T <: FloatTypes} - x2 = x * x - x4 = x2 * x2 - a = horner_simd(x4 * x4, P) - - # following computes - # a[1].value + a[2].value*x + a[3].value*x^2 + a[4].value*x^3 + a[5].value*x^4 + a[6].value*x^5 + a[7].value*x^6 + a[8].value*x^7 - - b = muladd(x, LVec{4, T}((a[4].value, a[2].value, a[8].value, a[6].value)), LVec{4, T}((a[3].value, a[1].value, a[7].value, a[5].value))) - c = muladd(x2, LVec{2, T}((b[1].value, b[3].value)), LVec{2, T}((b[2].value, b[4].value))) - return muladd(x4, c[2].value, c[1].value) -end - -# -# Following packs coefficients for second, fourth, and eighth order horner evaluations. -# Accepts a single polynomial that will pack into proper coefficients -# -# Usage: -# x = 1.1 -# poly = (1.0, 0.5, 0.1, 0.05, 0.1) -# evalpoly(x, poly) ≈ horner2(x, pack_horner2(poly)) ≈ horner4(x, pack_horner4(poly)) ≈ horner8(x, pack_horner8(poly)) -# -# Note: these functions will pack statically known polynomials for N <= 32 at compile time. -# If length(poly) > 32 they must be packed and declared as constants otherwise packing will allocate at runtime -# Example: -# const P = pack_horner2(ntuple(i -> i*0.1, 35)) -# horner2(x, P) -# -# TODO: Use a generated function instead might make the packing more reliable for all polynomial degrees -# - -@inline function pack_horner2(p::NTuple{N, T}) where {N, T <: FloatTypes} - rem = N % 2 - pad = !iszero(rem) ? (2 - rem) : 0 - P = (p..., ntuple(i -> zero(T), Val(pad))...) - return ntuple(i -> LVec{2, T}((P[2i - 1], P[2i])), Val((N + pad) ÷ 2)) -end - -@inline function pack_horner4(p::NTuple{N, T}) where {N, T <: FloatTypes} - rem = N % 4 - pad = !iszero(rem) ? (4 - rem) : 0 - P = (p..., ntuple(i -> zero(T), Val(pad))...) - return ntuple(i -> LVec{4, T}((P[4i - 3], P[4i - 2], P[4i - 1], P[4i])), Val((N + pad) ÷ 4)) -end - -@inline function pack_horner8(p::NTuple{N, T}) where {N, T <: FloatTypes} - rem = N % 8 - pad = !iszero(rem) ? (8 - rem) : 0 - P = (p..., ntuple(i -> zero(T), Val(pad))...) - return ntuple(i -> LVec{8, T}((P[8i - 7], P[8i - 6], P[8i - 5], P[8i - 4], P[8i - 3], P[8i - 2], P[8i - 1], P[8i])), Val((N + pad) ÷ 8)) -end diff --git a/src/SIMDMath/readme.md b/src/SIMDMath/readme.md deleted file mode 100644 index 9330211..0000000 --- a/src/SIMDMath/readme.md +++ /dev/null @@ -1,126 +0,0 @@ -# SIMDMath.jl - -A lightweight module for explicit vectorization of simple math functions. The focus is mainly on vectorizing polynomial evaluation in two main cases: (1) evaluating many different polynomials of similar length and (2) evaluating a single large polynomial. 
-This module is for statically known functions where the coefficients are unrolled and the size of the tuples is known at compilation. For more advanced needs it will be better to use SIMD.jl or LoopVectorization.jl. - -### Case 1: Evaluating many different polynomials. - -In the evaluation of special functions, we often need to compute many polynomials at the same `x`. An example structure would look like... -```julia -const NT = 12 -const P = ( - ntuple(n -> rand()*(-1)^n / n, NT), - ntuple(n -> rand()*(-1)^n / n, NT), - ntuple(n -> rand()*(-1)^n / n, NT), - ntuple(n -> rand()*(-1)^n / n, NT) - ) - -function test(x) - x2 = x * x - x3 = x2 * x - - p1 = evalpoly(x3, P[1]) - p2 = evalpoly(x3, P[2]) - p3 = evalpoly(x3, P[3]) - p4 = evalpoly(x3, P[4]) - - return muladd(x, -p2, p1), muladd(x2, p4, -p3) - end -``` -This structure is advantageous for vectorizing as `p1`, `p2`, `p3`, and `p4` are independent, require same number of evaluations, and coefficients are statically known. -However, we are relying on the auto-vectorizer to make sure this happens which is very fragile. In general, two polynomials might auto-vectorizer depending on how the values are used but is not reliable. -We can check that this function is not vectorizing (though it may on some architectures) by using `@code_llvm test(1.1)` and/or `@code_native(1.1)`. - -Another way to test this is to benchmark this function and compare to the time to compute a single polynomial. -```julia -julia> @btime test(x) setup=(x=rand()*2) - 13.026 ns (0 allocations: 0 bytes) - -julia> @btime evalpoly(x, P[1]) setup=(x=rand()*2) - 3.973 ns (0 allocations: 0 bytes) -``` -In this case, `test` is almost 4x longer as all the polynomial evaluations are happening sequentially. - -We can do much better by making sure these polynomials vectorize. -```julia -# using the same coefficients as above -julia> using Bessels.SIMDMath - -@inline function test_simd(x) - x2 = x * x - x3 = x2 * x - p = horner_simd(x3, pack_horner(P...)) - return muladd(x, -p[2].value, p[1].value), muladd(x2, p[4].value, -p[3].value) -end - -julia> @btime test_simd(x) setup=(x=rand()*2) - 4.440 ns (0 allocations: 0 bytes) -``` - -### Case 2: Evaluating a single polynomial. - -In some cases, we are interested in improving the performance when evaluating a single polynomial of larger degree. Horner's scheme is latency bound and for large polynomials (N>10) this can become a large part of the total runtime. We can test the performance of using a straight Horner scheme using the Base library function `evalpoly` against the higher order Horner schemes. 
-```julia -let - horner_times = [] - horner2_times = [] - horner4_times = [] - horner8_times = [] - - for N in [4, 8, 12, 16, 32, 64, 128, 256, 512] - poly = ntuple(n -> rand()*(-1)^n / n, N) - poly_packed2 = pack_horner2(poly) - poly_packed4 = pack_horner4(poly) - poly_packed8 = pack_horner8(poly) - - t1 = @benchmark evalpoly(x, $poly) setup=(x=rand()) - t2 = @benchmark horner2(x, $poly_packed2) setup=(x=rand()) - t3 = @benchmark horner4(x, $poly_packed4) setup=(x=rand()) - t4 = @benchmark horner8(x, $poly_packed8) setup=(x=rand()) - - push!(horner_times, round(minimum(t1).time, digits=3)) - push!(horner2_times, round(minimum(t2).time, digits=3)) - push!(horner4_times, round(minimum(t3).time, digits=3)) - push!(horner8_times, round(minimum(t4).time, digits=3)) - - end - @show horner_times ./ horner2_times - @show horner_times ./ horner4_times - @show horner_times ./ horner8_times -end -``` - -As mentioned, Horner's scheme requires sequential multiply-add instructions that can't be performed in parallel. One way (another way is Estrin's method which we won't discuss) to improve this structure is to break the polynomial down into even and odd polynomials (a second order Horner's scheme) or into larger powers of `x^4` or `x^8` (a fourth and eighth order Horner's scheme) which allow for computing many different polynomials of similar length simultaneously. In some regard, we are just rearranging the coefficients and using the same method as we did in the first case with some additional arithmetic at the end to add all the different polynomials together. - -The last fact is important because we are actually increasing the total amount of arithmetic operations needed but increasing by a large amount the number of operations that can happen in parallel. The increased operations make the advantages of this approach less straightforward than the first case which is always superior. The second and perhaps most important point is that floating point arithmetic is not associative so these approaches will give slightly different results as we are adding and multiplying in slightly differnet order. - -The above benchmarks are as follows. -```julia -# Relative Speedup compared to evalpoly when evaluating a polynomial of N degree -# N = [4, 8, 12, 16, 32, 64, 128, 256, 512] - -# CPU #1 -Julia Version 1.8.2 -Platform Info: - OS: macOS (arm64-apple-darwin21.3.0) - CPU: 8 × Apple M1 - -# N = [ 4, 8, 12, 16, 32, 64, 128, 256, 512] -horner2 = [1.00, 1.00, 1.16, 1.16, 2.57, 3.14, 1.59, 1.76, 2.05] -horner4 = [1.00, 0.98, 1.14, 1.00, 2.57, 4.23, 3.53, 3.87, 4.57] -horner8 = [0.88, 0.86, 1.00, 1.00, 2.29, 4.91, 6.21, 7.19, 8.45] - -# CPU #2 -julia> versioninfo() -Julia Version 1.8.2 -Commit 36034abf260 (2022-09-29 15:21 UTC) -Platform Info: - OS: Linux (x86_64-linux-gnu) - CPU: 12 × Intel(R) Core(TM) i7-8700K CPU @ 3.70GHz - -# N = [ 4, 8, 12, 16, 32, 64, 128, 256, 512] -horner2 = [1.00, 1.11, 1.30, 1.37, 2.61, 2.11, 2.09, 2.59, 2.27] -horner4 = [1.00, 0.93, 1.21, 1.44, 3.30, 5.77, 4.69, 5.41, 5.65] -horner8 = [0.68, 0.84, 0.95, 1.26, 3.05, 6.37, 6.98, 8.76, 9.45] -``` -Asymptotically, we can see that the method approaches a 2, 4, and 8x increase respecitively for large degrees, however, for smaller degrees the advanges are more complex. Therefore, it is encouraged to test the performance for individual cases. Of course, this depends on statically knowing the polynomial size during compilation which allows for packing the coefficients in the most efficient way. 
diff --git a/test/SIMDMath_test.jl b/test/SIMDMath_test.jl deleted file mode 100644 index 6c940e3..0000000 --- a/test/SIMDMath_test.jl +++ /dev/null @@ -1,112 +0,0 @@ -using Bessels.SIMDMath -using Test - - -# Test horner_simd -let - NT = 12 - P = ( - ntuple(n -> rand()*(-1)^n / n, NT), - ntuple(n -> rand()*(-1)^n / n, NT), - ntuple(n -> rand()*(-1)^n / n, NT), - ntuple(n -> rand()*(-1)^n / n, NT) - ) - - x = 0.9 - a = horner_simd(x, pack_horner(P)) - @test evalpoly(x, P[1]) == a[1].value - @test evalpoly(x, P[2]) == a[2].value - @test evalpoly(x, P[3]) == a[3].value - @test evalpoly(x, P[4]) == a[4].value - - NT = 24 - P32 = ( - ntuple(n -> Float32(rand()*(-1)^n / n), NT), - ntuple(n -> Float32(rand()*(-1)^n / n), NT), - ntuple(n -> Float32(rand()*(-1)^n / n), NT), - ntuple(n -> Float32(rand()*(-1)^n / n), NT), - ntuple(n -> Float32(rand()*(-1)^n / n), NT), - ntuple(n -> Float32(rand()*(-1)^n / n), NT), - ntuple(n -> Float32(rand()*(-1)^n / n), NT), - ntuple(n -> Float32(rand()*(-1)^n / n), NT), - ) - - x = 1.2f0 - a = horner_simd(x, pack_horner(P32)) - @test evalpoly(x, P32[1]) == a[1].value - @test evalpoly(x, P32[2]) == a[2].value - @test evalpoly(x, P32[3]) == a[3].value - @test evalpoly(x, P32[4]) == a[4].value - @test evalpoly(x, P32[5]) == a[5].value - @test evalpoly(x, P32[6]) == a[6].value - @test evalpoly(x, P32[7]) == a[7].value - @test evalpoly(x, P32[8]) == a[8].value - - NT = 4 - P16 = ( - ntuple(n -> Float16(rand()*(-1)^n / n), NT), - ntuple(n -> Float16(rand()*(-1)^n / n), NT), - ntuple(n -> Float16(rand()*(-1)^n / n), NT), - ) - - x = Float16(0.8) - a = horner_simd(x, pack_horner(P16)) - @test evalpoly(x, P16[1]) ≈ a[1].value - @test evalpoly(x, P16[2]) ≈ a[2].value - @test evalpoly(x, P16[3]) ≈ a[3].value - -end - -# test second, fourth, and eighth order horner schemes -# note that the higher order schemes are more prone to overflow when using lower precision -# because you have to directly compute x^2, x^4, x^8 before the routine -let - for N in [2, 3, 4, 5, 6, 7, 10, 13, 17, 20, 52, 89], x in [0.1, 0.5, 1.5, 4.2, 45.0] - poly = ntuple(n -> rand()*(-1)^n / n, N) - @test evalpoly(x, poly) ≈ horner2(x, pack_horner2(poly)) ≈ horner4(x, pack_horner4(poly)) ≈ horner8(x, pack_horner8(poly)) - end - for N in [2, 3, 4, 5, 6, 7, 10], x32 in [0.1f0, 0.8f0, 2.4f0, 8.0f0, 25.0f0] - poly32 = ntuple(n -> Float32(rand()*(-1)^n / n), N) - @test evalpoly(x32, poly32) ≈ horner2(x32, pack_horner2(poly32)) ≈ horner4(x32, pack_horner4(poly32)) ≈ horner8(x32, pack_horner8(poly32)) - end - for N in [2, 3, 4, 5, 6], x in [0.1, 0.5, 2.2] - poly16 = ntuple(n -> Float16(rand()*(-1)^n / n), N) - x16 = Float16.(x) - @test evalpoly(x16, poly16) ≈ horner2(x16, pack_horner2(poly16)) ≈ horner4(x16, pack_horner4(poly16)) ≈ horner8(x16, pack_horner8(poly16)) - end - -end - -# Tests for Clenshaw algorithm to evaluate Chebyshev polynomials - -# non-SIMD version -function clen(x, c) - x2 = 2x - c0 = c[end-1] - c1 = c[end] - for i in length(c)-2:-1:1 - c0, c1 = c[i] - c1, c0 + c1 * x2 - end - return c0 + c1 * x -end - -let - for N in [2, 6, 10], x in [0.1, 0.5, 1.5, 4.2, 45.0] - P = ( - ntuple(n -> rand()*(-1)^n / n, N), - ntuple(n -> rand()*(-1)^n / n, N), - ntuple(n -> rand()*(-1)^n / n, N), - ntuple(n -> rand()*(-1)^n / n, N) - ) - # the native code generated between the SIMD and non-simd cases - # is slightly different due to the SIMD case always using the fsub instruction - # where the non-simd case sometimes chooses to reorder this in the native code generation - # some small tests showed the SIMD case 
ordering was slightly more accurate - # the SIMD case using this instruction is also faster than even a single evaluation - a = clenshaw_simd(x, pack_horner(P)) - @test clen(x, P[1]) ≈ a[1].value - @test clen(x, P[2]) ≈ a[2].value - @test clen(x, P[3]) ≈ a[3].value - @test clen(x, P[4]) ≈ a[4].value - end -end diff --git a/test/runtests.jl b/test/runtests.jl index 8452f2b..69e6085 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -10,4 +10,3 @@ import SpecialFunctions @time @testset "gamma" begin include("gamma_test.jl") end @time @testset "airy" begin include("airy_test.jl") end @time @testset "sphericalbessel" begin include("sphericalbessel_test.jl") end -@time @testset "SIMDMath" begin include("SIMDMath_test.jl") end