Element-wise BLAS APIs & new Tensor for Python: ⬆️ 450 kernels #220

Open · wants to merge 68 commits into `main`

Changes from all commits (68):
fb5dce7
Add: Sum & Scale APIs
ashvardanian Oct 31, 2024
6cdd147
Docs: PyTorch examples & Element-wise ops
ashvardanian Oct 28, 2024
d436795
Improve: Faster scale/shift on Haswell with FMA
ashvardanian Oct 31, 2024
a57264a
Add: `simsimd_ndindex_t` for high-rank tensors
ashvardanian Nov 1, 2024
e382774
Merge pull request #223 from ashvardanian/main
ashvardanian Nov 1, 2024
25a76ca
Make: Mark tests in CMake (#226)
Spixmaster Nov 2, 2024
7666884
Add: Same-type element-wise ops
ashvardanian Nov 4, 2024
999243e
Merge branch 'main-elementwise' of https://github.com/ashvardanian/Si…
ashvardanian Nov 4, 2024
617e1f7
Fix: Increment `global_offset` on final step
ashvardanian Nov 4, 2024
0ce4024
Fix: Cast in `sum_i8_haswell`
ashvardanian Nov 4, 2024
d54f567
Fix: Conflicting type on Windows
ashvardanian Nov 4, 2024
a334e99
Merge branch 'main' into main-elementwise
ashvardanian Nov 5, 2024
18c41fd
Improve: Type-casting logic
ashvardanian Nov 5, 2024
47f79c7
Improve: `ndindex` -> `mdindices`
ashvardanian Nov 6, 2024
4bfe1d1
Improve: Drop `global_offset`
ashvardanian Nov 6, 2024
ac5841f
Add: `mdspan`
ashvardanian Nov 6, 2024
4c69e7d
Add: Type-casts to & from `[iuf]64`
ashvardanian Nov 6, 2024
4646d6b
Break: Support mixed-type element-wise ops
ashvardanian Nov 6, 2024
383b799
Break: Shorter op-codes
ashvardanian Nov 6, 2024
08010ba
Improve: Same type-casting as NumPy
ashvardanian Nov 7, 2024
54bb07d
Add: `i16` element-wise kernels for NEON
ashvardanian Nov 8, 2024
1f91b92
Add: `i32` element-wise kernels for NEON
ashvardanian Nov 8, 2024
75993e7
Add: `i64` element-wise kernels for NEON
ashvardanian Nov 8, 2024
38df49c
Break: Shorter symbol names
ashvardanian Nov 8, 2024
0e7c656
Add: `i16` element-wise kernels for Haswell
ashvardanian Nov 8, 2024
e2698b0
Add: `i32` element-wise kernels for Haswell
ashvardanian Nov 8, 2024
d10d27e
Add: `i8` element-wise kernels for Skylake
ashvardanian Nov 8, 2024
8950a7e
Add: `i16` element-wise kernels for Skylake
ashvardanian Nov 9, 2024
d1bb51c
Add: `i32` element-wise kernels for Skylake
ashvardanian Nov 9, 2024
463e8f3
Add: `i64` element-wise kernels for Skylake
ashvardanian Nov 9, 2024
e089626
Improve: Unsigned type literals for masks
ashvardanian Nov 9, 2024
09735ea
Add: Element-wise saturated addition for Ice Lake
ashvardanian Nov 9, 2024
602f812
Add: Dynamic dispatch for element-wise ops
ashvardanian Nov 9, 2024
3aac9ad
Add: Missing serial integer `wsum`-s
ashvardanian Nov 9, 2024
02236d1
Fix: Match type-casting rules of NumPy
ashvardanian Nov 9, 2024
48bd712
Add: `simsimd.multiply`
ashvardanian Nov 9, 2024
7aa118b
Fix: `_mm256_adds_epi32` emulation
ashvardanian Nov 10, 2024
8295e11
Fix: Serial emulation of `_mm256_adds_epu32`
ashvardanian Nov 10, 2024
400dfaa
Fix: `sadd` for `u(8|16|32)`
ashvardanian Nov 10, 2024
d81868a
Add: `simsimd.multiply`
ashvardanian Nov 10, 2024
79c4552
Fix: Missing 64-bit Haswell kernels
ashvardanian Nov 10, 2024
3f48285
Improve: Clipping doubles on Haswell
ashvardanian Nov 11, 2024
bcbe538
Fix: Missing `__m256d[]` operator on MSVC
ashvardanian Nov 11, 2024
9afe040
Improve: Reduce fuzzy tests
ashvardanian Nov 11, 2024
a4fce6d
Fix: Keeping one capability ON
ashvardanian Nov 11, 2024
ac60194
Improve: Overflow clipping on Skylake
ashvardanian Nov 11, 2024
45e806f
Improve: Clipping on x86
ashvardanian Nov 11, 2024
69e3a94
Fix: Inferring `possible_capabilities`
ashvardanian Nov 11, 2024
e568e6c
Improve: Log operand descriptor
ashvardanian Nov 11, 2024
4d0880f
Merge branch 'main' into main-elementwise
ashvardanian Nov 11, 2024
a0f88b7
Improve: Test saturating arithmetic
ashvardanian Nov 11, 2024
72b219e
Merge branch 'main-elementwise' of https://github.com/ashvardanian/Si…
ashvardanian Nov 11, 2024
0bf67d0
Improve: Re-group Py/Rs benchmarks
ashvardanian Nov 12, 2024
b3f98e6
Add: `u8` APIs to Rust SDK
ashvardanian Nov 12, 2024
c49abe3
Make: Bump Rust dependencies
ashvardanian Nov 12, 2024
6f69eee
Improve: Generalize Rust benchmarks
ashvardanian Nov 12, 2024
cf507db
Improve: Report throughput in Rust benchmarks
ashvardanian Nov 12, 2024
a22607d
Add: BLAS benchmarks for elementwise ops
ashvardanian Nov 12, 2024
b6012ca
Fix: FMA can't be implemented in BLAS
ashvardanian Nov 13, 2024
8fb5a0c
Add: Element-wise Python benchmark
ashvardanian Nov 13, 2024
fe62187
Improve: Mixed `dtype` benchmarks
ashvardanian Nov 13, 2024
8c9b71e
Improve: "MD" -> "XD"
ashvardanian Nov 15, 2024
b480b5c
Break: `cos` distance renamed to `angular`
ashvardanian Nov 20, 2024
d65c6e8
Improve: Ignore renaming to `angular`
ashvardanian Nov 20, 2024
96adae5
Add: Trigonometry based on SLEEF
ashvardanian Nov 20, 2024
cb14ffb
Improve: Polish `simsimd_f32_sin`
ashvardanian Nov 20, 2024
d4a6288
Improve: Cleaner trigonometry
ashvardanian Nov 21, 2024
bc6ed87
Add: `atan` & `atan2` serial variants
ashvardanian Nov 22, 2024
1 change: 1 addition & 0 deletions .git-blame-ignore-revs
@@ -1,2 +1,3 @@
a4022a988287e527757ecc9bc16a4f2e7dc4770e
750c59f5116a2000507a0cec09db009fd7d31232
b480b5c3ebddd6de0f8e1c179cdc02f18edbb8ae
6 changes: 6 additions & 0 deletions .vscode/settings.json
@@ -100,9 +100,11 @@
"cSpell.words": [
"allclose",
"Altra",
"astype",
"Axion",
"bfloat",
"bitalg",
"bitmask",
"BLAS",
"castsi",
"CBLAS",
@@ -133,6 +135,9 @@
"Logarithmotechnia",
"maccs",
"maskz",
"mdindices",
"mdspan",
"musllinux",
"napi",
"ndarray",
"Needleman",
@@ -168,6 +173,7 @@
"VNNI",
"vpopcntdq",
"Wojciech",
"wsum",
"Wunsch",
"Zilla"
],
4 changes: 4 additions & 0 deletions CMakeLists.txt
@@ -106,10 +106,14 @@ endif ()
if (SIMSIMD_BUILD_TESTS)
    add_executable(simsimd_test_compile_time scripts/test.c)
    target_link_libraries(simsimd_test_compile_time simsimd m)
    add_test(NAME simsimd_test_compile_time COMMAND simsimd_test_compile_time)

    add_executable(simsimd_test_run_time scripts/test.c c/lib.c)
    target_compile_definitions(simsimd_test_run_time PRIVATE SIMSIMD_DYNAMIC_DISPATCH=1)
    target_link_libraries(simsimd_test_run_time simsimd m)
    add_test(NAME simsimd_test_run_time COMMAND simsimd_test_run_time)

    enable_testing()
endif ()

if (SIMSIMD_BUILD_SHARED)
13 changes: 10 additions & 3 deletions CONTRIBUTING.md
@@ -63,6 +63,13 @@

```sh
cmake -D CMAKE_BUILD_TYPE=Release \
cmake --build build_release --config Release
```

I'd recommend setting the following breakpoints:

- `__asan::ReportGenericError` - to detect illegal memory accesses.
- `__GI_exit` - to stop at exit points, i.e., the end of any executable's run.
- `__builtin_unreachable` - to catch unexpected code paths.
- `_sz_assert_failure` - to catch StringZilla logic assertions.

## Python

Testing:
@@ -91,14 +98,14 @@ Benchmarking:

```sh
pip install numpy scipy scikit-learn # for comparison baselines
- python scripts/bench_vectors.py # to run default benchmarks
- python scripts/bench_vectors.py --n 1000 --ndim 1536 # batch size and dimensions
+ python scripts/bench_similarity.py # to run default benchmarks
+ python scripts/bench_similarity.py --n 1000 --ndim 1536 # batch size and dimensions
```

You can also benchmark against other libraries and filter the numeric types and distance metrics:

```sh
- $ python scripts/bench_vectors.py --help
+ $ python scripts/bench_similarity.py --help
> usage: bench.py [-h] [--ndim NDIM] [-n COUNT]
> [--metric {all,dot,spatial,binary,probability,sparse}]
> [--dtype {all,bin8,int8,uint16,uint32,float16,float32,float64,bfloat16,complex32,complex64,complex128}]
```
21 changes: 14 additions & 7 deletions Cargo.lock

Some generated files are not rendered by default.

15 changes: 5 additions & 10 deletions Cargo.toml
@@ -24,18 +24,12 @@ name = "simsimd"
path = "rust/lib.rs"

[build-dependencies]
cc = "1.0.83"


[[bench]]
name = "cosine"
harness = false
path = "scripts/bench_cosine.rs"
cc = "1.2.0"

[[bench]]
name = "sqeuclidean"
name = "bench_similarity"
harness = false
path = "scripts/bench_sqeuclidean.rs"
path = "scripts/bench_similarity.rs"

[profile.bench]
opt-level = 3 # Corresponds to -O3
@@ -46,4 +40,5 @@ rpath = false # On some systems, setting this to false can help with optimiz
[dev-dependencies]
criterion = { version = "0.5.1" }
rand = { version = "0.8.5" }
half = { version = "2.4.0" }
half = { version = "2.4.1" }
num-traits = "0.2.19"
72 changes: 45 additions & 27 deletions README.md
@@ -2,8 +2,8 @@

Computing dot-products, similarity measures, and distances between low- and high-dimensional vectors is ubiquitous in Machine Learning, Scientific Computing, Geo-Spatial Analysis, and Information Retrieval.
These algorithms generally have linear complexity in time, constant or linear complexity in space, and are data-parallel.
- In other words, it is easily parallelizable and vectorizable and often available in packages like BLAS (level 1) and LAPACK, as well as higher-level `numpy` and `scipy` Python libraries.
- Ironically, even with decades of evolution in compilers and numerical computing, [most libraries can be 3-200x slower than hardware potential][benchmarks] even on the most popular hardware, like 64-bit x86 and Arm CPUs.
+ In other words, they are easily parallelizable and vectorizable and often available in packages like BLAS (level 1) and LAPACK, as well as higher-level `numpy` and `scipy` Python libraries.
+ Ironically, even with decades of evolution in compilers and numerical computing, [most libraries can be 3x - 1'000x slower than hardware potential][benchmarks] even on the most popular hardware, like 64-bit x86 and Arm CPUs.
Moreover, most lack mixed-precision support, which is crucial for modern AI!
The rare few that support minimal mixed precision, run only on one platform, and are vendor-locked, by companies like Intel and Nvidia.
SimSIMD provides an alternative.
@@ -42,7 +42,7 @@ SimSIMD provides an alternative.

## Features

- __SimSIMD__ (Arabic: "سيمسيم دي") is a mixed-precision math library of __over 200 SIMD-optimized kernels__ extensively used in AI, Search, and DBMS workloads.
+ __SimSIMD__ (Arabic: "سيمسيم دي") is a mixed-precision math library of __over 450 SIMD-optimized kernels__ extensively used in AI, Search, and DBMS workloads.
Named after the iconic ["Open Sesame"](https://en.wikipedia.org/wiki/Open_sesame) command that opened doors to treasure in _Ali Baba and the Forty Thieves_, SimSimd can help you 10x the cost-efficiency of your computational pipelines.
Implemented distance functions include:

@@ -52,7 +52,7 @@ Implemented distance functions include:
- Set Intersections for Sparse Vectors and Text Analysis. _[docs][docs-sparse]_
- Mahalanobis distance and Quadratic forms for Scientific Computing. _[docs][docs-curved]_
- Kullback-Leibler and Jensen–Shannon divergences for probability distributions. _[docs][docs-probability]_
- - Fused-Multiply-Add (FMA) and Weighted Sums to replace BLAS level 1 functions. _[docs][docs-fma]_
+ - Fused-Multiply-Add (FMA) and Weighted Sums to replace BLAS level 1 functions. _[docs][docs-elementwise]_
- For Levenshtein, Needleman–Wunsch, and Smith-Waterman, check [StringZilla][stringzilla].
- 🔜 Haversine and Vincenty's formulae for Geospatial Analysis.

@@ -62,7 +62,7 @@ Implemented distance functions include:
[docs-binary]: https://github.com/ashvardanian/SimSIMD/pull/138
[docs-dot]: #complex-dot-products-conjugate-dot-products-and-complex-numbers
[docs-probability]: #logarithms-in-kullback-leibler--jensenshannon-divergences
- [docs-fma]: #mixed-precision-in-fused-multiply-add-and-weighted-sums
+ [docs-elementwise]: #mixed-precision-in-fused-multiply-add-and-weighted-sums
[scipy]: https://docs.scipy.org/doc/scipy/reference/spatial.distance.html#module-scipy.spatial.distance
[numpy]: https://numpy.org/doc/stable/reference/generated/numpy.inner.html
[stringzilla]: https://github.com/ashvardanian/stringzilla
@@ -139,7 +139,7 @@

```py
import numpy as np

vec1 = np.random.randn(1536).astype(np.float32)
vec2 = np.random.randn(1536).astype(np.float32)
- dist = simsimd.cosine(vec1, vec2)
+ dist = simsimd.angular(vec1, vec2)
```

Supported functions include `cosine`, `inner`, `sqeuclidean`, `hamming`, `jaccard`, `kulbackleibler`, `jensenshannon`, and `intersect`.
@@ -158,11 +158,11 @@

Unlike SciPy, SimSIMD allows explicitly stating the precision of the input vectors.
The `dtype` argument can be passed both by name and as a positional argument:

```py
dist = simsimd.cosine(vec1, vec2, "int8")
dist = simsimd.cosine(vec1, vec2, "float16")
dist = simsimd.cosine(vec1, vec2, "float32")
dist = simsimd.cosine(vec1, vec2, "float64")
dist = simsimd.hamming(vec1, vec2, "bit8")
dist = simsimd.angular(vec1, vec2, "int8")
dist = simsimd.angular(vec1, vec2, "float16")
dist = simsimd.angular(vec1, vec2, "float32")
dist = simsimd.angular(vec1, vec2, "float64")
dist = simsimd.jaccard(vec1, vec2, "bin8") # Binary vectors with 8-bit words
```
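
For binary metrics, the `bin8` view expects bits packed into 8-bit words. A minimal sketch of producing such vectors from boolean masks with NumPy's `np.packbits`:

```py
import numpy as np
import simsimd

mask1 = np.random.rand(1536) > 0.5  # boolean vectors
mask2 = np.random.rand(1536) > 0.5

bits1 = np.packbits(mask1)  # 1536 bits -> 192 bytes
bits2 = np.packbits(mask2)

dist = simsimd.hamming(bits1, bits2, "bin8")
```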

With other frameworks, like PyTorch, one can get a richer type-system than NumPy, but the lack of good CPython interoperability makes it hard to pass data without copies.
@@ -181,7 +181,7 @@

```py
torch.randn(8, out=vec2)

# Both libs will look into the same memory buffers and report the same results
dist_slow = 1 - torch.nn.functional.cosine_similarity(vec1, vec2, dim=0)
- dist_fast = simsimd.cosine(buf1, buf2, "bf16")
+ dist_fast = simsimd.angular(buf1, buf2, "bf16")
```

It also allows using SimSIMD for half-precision complex numbers, which NumPy does not support.
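
As a sketch of what that can look like, assuming the `dot` and `vdot` kernels accept the same `dtype` override and treat `complex32` as interleaved half-precision (real, imaginary) pairs:

```py
import numpy as np
import simsimd

# 768 complex numbers as 1536 interleaved (real, imaginary) `float16` words,
# since NumPy has no half-precision complex dtype of its own
vec1 = np.random.randn(1536).astype(np.float16)
vec2 = np.random.randn(1536).astype(np.float16)

product = simsimd.dot(vec1, vec2, "complex32")    # unconjugated dot product
conjugate = simsimd.vdot(vec1, vec2, "complex32") # conjugated (Hermitian) variant
```
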
@@ -220,8 +220,8 @@

```py
vec1 = np.random.randn(1536).astype(np.float32) # rank 1 tensor
batch1 = np.random.randn(1, 1536).astype(np.float32) # rank 2 tensor
batch2 = np.random.randn(100, 1536).astype(np.float32)

- dist_rank1 = simsimd.cosine(vec1, batch2)
- dist_rank2 = simsimd.cosine(batch1, batch2)
+ dist_rank1 = simsimd.angular(vec1, batch2)
+ dist_rank2 = simsimd.angular(batch1, batch2)
```

### Many-to-Many Distances
Expand All @@ -232,7 +232,7 @@ For two batches of 100 vectors to compute 100 distances, one would call it like
```py
batch1 = np.random.randn(100, 1536).astype(np.float32)
batch2 = np.random.randn(100, 1536).astype(np.float32)
- dist = simsimd.cosine(batch1, batch2)
+ dist = simsimd.angular(batch1, batch2)
```

Input matrices must have identical shapes.
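
If every pairwise combination is needed instead, the `cdist` interface covers that. A short sketch, assuming the `metric` keyword accepts the post-rename `angular` name:

```py
import numpy as np
import simsimd

matrix1 = np.random.randn(100, 1536).astype(np.float32)
matrix2 = np.random.randn(200, 1536).astype(np.float32)

# Computes a 100 x 200 matrix of all pairwise distances
distances = simsimd.cdist(matrix1, matrix2, metric="angular")
```
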
@@ -609,7 +609,7 @@

```swift
import SimSIMD
let vectorA: [Int8] = [1, 2, 3]
let vectorB: [Int8] = [4, 5, 6]

- let cosineSimilarity = vectorA.cosine(vectorB) // Computes the cosine similarity
+ let cosineSimilarity = vectorA.angular(vectorB) // Computes the cosine similarity
let dotProduct = vectorA.dot(vectorB) // Computes the dot product
let sqEuclidean = vectorA.sqeuclidean(vectorB) // Computes the squared Euclidean distance
```
@@ -637,9 +637,9 @@

```c
int main() {
    simsimd_f32_t vector_a[1536];
    simsimd_f32_t vector_b[1536];
    simsimd_kernel_punned_t distance_function = simsimd_metric_punned(
-       simsimd_metric_cos_k,   // Metric kind, like the angular cosine distance
-       simsimd_datatype_f32_k, // Data type, like: f16, f32, f64, i8, b8, and complex variants
-       simsimd_cap_any_k);     // Which CPU capabilities are we allowed to use
+       simsimd_angular_k,      // Metric kind, like the angular cosine distance
+       simsimd_f32_k,          // Data type, like: f16, f32, f64, i8, b8, complex variants, etc.
+       simsimd_cap_any_k);     // Which CPU capabilities are we allowed to use
    simsimd_distance_t distance;
    distance_function(vector_a, vector_b, 1536, &distance);
    return 0;
}
```

@@ -684,10 +684,10 @@

```c
int main() {
    simsimd_distance_t distance;

    // Cosine distance between two vectors
-   simsimd_cos_i8(i8s, i8s, 1536, &distance);
-   simsimd_cos_f16(f16s, f16s, 1536, &distance);
-   simsimd_cos_f32(f32s, f32s, 1536, &distance);
-   simsimd_cos_f64(f64s, f64s, 1536, &distance);
+   simsimd_angular_i8(i8s, i8s, 1536, &distance);
+   simsimd_angular_f16(f16s, f16s, 1536, &distance);
+   simsimd_angular_f32(f32s, f32s, 1536, &distance);
+   simsimd_angular_f64(f64s, f64s, 1536, &distance);

    // Euclidean distance between two vectors
    simsimd_l2sq_i8(i8s, i8s, 1536, &distance);
```

@@ -988,24 +988,42 @@ Both functions are defined for non-negative numbers, and the logarithm is a key

### Mixed Precision in Fused-Multiply-Add and Weighted Sums

- The Fused-Multiply-Add (FMA) operation is a single operation that combines element-wise multiplication and addition with different scaling factors.
- The Weighted Sum is it's simplified variant without element-wise multiplication.
+ The "Fused-Multiply-Add" (FMA) operation is a single operation that combines element-wise multiplication and addition with different scaling factors.
+ The "Weighted Sum" is its simplified variant without element-wise multiplication.
+ The "Sum" operation is a further simplified variant without scaling factors, and "Scale" is the unary equivalent of FMA:

```math
\text{Scale}_i(A, \alpha, \beta) = \alpha \cdot A_i + \beta
```

```math
\text{Sum}_i(A, B) = A_i + B_i
```

```math
\text{WSum}_i(A, B, \alpha, \beta) = \alpha \cdot A_i + \beta \cdot B_i
```

```math
\text{FMA}_i(A, B, C, \alpha, \beta) = \alpha \cdot A_i \cdot B_i + \beta \cdot C_i
```

In NumPy terms, the implementation __may__ look like:

```py
import numpy as np

def scale(A: np.ndarray, /, Alpha: float, Beta: float) -> np.ndarray:
    return (Alpha * A + Beta).astype(A.dtype)

def sum(A: np.ndarray, B: np.ndarray) -> np.ndarray:
    assert A.dtype == B.dtype, "Input types must match and affect the output type"
    return (A + B).astype(A.dtype)

def wsum(A: np.ndarray, B: np.ndarray, /, Alpha: float, Beta: float) -> np.ndarray:
    assert A.dtype == B.dtype, "Input types must match and affect the output type"
    return (Alpha * A + Beta * B).astype(A.dtype)

def fma(A: np.ndarray, B: np.ndarray, C: np.ndarray, /, Alpha: float, Beta: float) -> np.ndarray:
    assert A.dtype == B.dtype and A.dtype == C.dtype, "Input types must match and affect the output type"
    return (Alpha * A * B + Beta * C).astype(A.dtype)
```
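
And a usage sketch against the new kernels themselves, assuming the Python bindings mirror the kernel names from the commit history (`scale`, `multiply`, `wsum`, `fma`) and accept `alpha` and `beta` keyword arguments:

```py
import numpy as np
import simsimd

a = np.random.randn(1536).astype(np.float16)
b = np.random.randn(1536).astype(np.float16)
c = np.random.randn(1536).astype(np.float16)

scaled = simsimd.scale(a, alpha=2.0, beta=1.0)    # alpha * A + beta
product = simsimd.multiply(a, b)                  # element-wise A * B
blended = simsimd.wsum(a, b, alpha=0.5, beta=0.5) # alpha * A + beta * B
fused = simsimd.fma(a, b, c, alpha=2.0, beta=3.0) # alpha * A * B + beta * C
```

Unlike the NumPy reference above, the SIMD kernels up-cast to a wider type internally and, per the commit history, saturate instead of wrapping around on integer overflow.
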
2 changes: 1 addition & 1 deletion build.rs
@@ -8,7 +8,7 @@ fn main() {
.define("SIMSIMD_NATIVE_BF16", "0")
.define("SIMSIMD_DYNAMIC_DISPATCH", "1")
.flag("-O3")
.flag("-std=c99") // Enforce C99 standard
.flag("-std=c23") // We could enforce the C99 standard, but it's nicer to use `_Float16` in C23
.flag("-pedantic") // Ensure strict compliance with the C standard
.warnings(false);
