From 7c87779d4c10dae46faf41b296f1786fa378f348 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Wed, 27 Nov 2024 13:03:17 +0000 Subject: [PATCH 1/2] Docs: FMA ports details --- include/simsimd/dot.h | 10 ++++++++++ include/simsimd/spatial.h | 10 ++++++++++ 2 files changed, 20 insertions(+) diff --git a/include/simsimd/dot.h b/include/simsimd/dot.h index 8246aba2..d52e9c16 100644 --- a/include/simsimd/dot.h +++ b/include/simsimd/dot.h @@ -130,6 +130,16 @@ SIMSIMD_PUBLIC void simsimd_dot_u8_haswell(simsimd_u8_t const* a, simsimd_u8_t c * Ice Lake added VNNI, VPOPCNTDQ, IFMA, VBMI, VAES, GFNI, VBMI2, BITALG, VPCLMULQDQ, and other extensions for integral operations. * Genoa added only BF16. * Sapphire Rapids added tiled matrix operations, but we are most interested in the new mixed-precision FMA instructions. + * + * Sadly, we can't effectively interleave 256-bit and 512-bit FMA instructions to utilize more ports: + * + * > Like Intel server architectures since Skylake-X, SPR cores feature two 512-bit FMA units, and organize them in a similar fashion. + * > One 512-bit FMA unit is created by fusing two 256-bit ones on port 0 and port 1. The other is added to port 5, as a server-specific + * > core extension. The FMA units on port 0 and 1 are configured into 2×256-bit or 1×512-bit mode depending on whether 512-bit FMA + * > instructions are present in the scheduler. That means a mix of 256-bit and 512-bit FMA instructions will not achieve higher IPC + * > than executing 512-bit instructions alone.
+ * + * Source: https://chipsandcheese.com/p/a-peek-at-sapphire-rapids */ SIMSIMD_PUBLIC void simsimd_dot_f64_skylake(simsimd_f64_t const* a, simsimd_f64_t const* b, simsimd_size_t n, simsimd_distance_t* result); SIMSIMD_PUBLIC void simsimd_dot_f64c_skylake(simsimd_f64c_t const* a, simsimd_f64c_t const* b, simsimd_size_t n, simsimd_distance_t* results); diff --git a/include/simsimd/spatial.h b/include/simsimd/spatial.h index 9b3db048..9e2d899a 100644 --- a/include/simsimd/spatial.h +++ b/include/simsimd/spatial.h @@ -139,6 +139,16 @@ SIMSIMD_PUBLIC void simsimd_cos_f64_haswell(simsimd_f64_t const* a, simsimd_f64_ /* SIMD-powered backends for AVX512 CPUs of Skylake generation and newer, using 32-bit arithmetic over 512-bit words. * Skylake was launched in 2015, and discontinued in 2019. Skylake had support for F, CD, VL, DQ, and BW extensions, * as well as masked operations. This is enough to supersede auto-vectorization on `f32` and `f64` types. + * + * Sadly, we can't effectively interleave 256-bit and 512-bit FMA instructions to utilize more ports: + * + * > Like Intel server architectures since Skylake-X, SPR cores feature two 512-bit FMA units, and organize them in a similar fashion. + * > One 512-bit FMA unit is created by fusing two 256-bit ones on port 0 and port 1. The other is added to port 5, as a server-specific + * > core extension. The FMA units on port 0 and 1 are configured into 2×256-bit or 1×512-bit mode depending on whether 512-bit FMA + * > instructions are present in the scheduler. That means a mix of 256-bit and 512-bit FMA instructions will not achieve higher IPC + * > than executing 512-bit instructions alone.
+ * + * Source: https://chipsandcheese.com/p/a-peek-at-sapphire-rapids */ SIMSIMD_PUBLIC void simsimd_l2_f32_skylake(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t n, simsimd_distance_t* d); SIMSIMD_PUBLIC void simsimd_l2sq_f32_skylake(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t n, simsimd_distance_t* d); From c375e3b9da1f09110e03f2cf88f0b4d53723ec28 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Wed, 27 Nov 2024 13:04:23 +0000 Subject: [PATCH 2/2] Docs: Navigating the codebase --- CONTRIBUTING.md | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 6370a20a..27fa11b8 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -6,6 +6,26 @@ To keep the quality of the code high, we have a set of [guidelines](https://gith - [How to organize branches?](https://github.com/unum-cloud/awesome/blob/main/Workflow.md#branches) - [How to style commits?](https://github.com/unum-cloud/awesome/blob/main/Workflow.md#commits) +## Navigating the Codebase + +Primary kernels are implemented in header files under `include/simsimd/`: + +- `dot.h` - dot products for real and complex vectors. +- `spatial.h` - spatial distances: L2 and cosine distances. +- `binary.h` - binary distances: Hamming, Jaccard, etc. +- `probability.h` - probability metrics: KL-divergence, Jensen-Shannon, etc. +- `sparse.h` - sparse distances: weighted and normal set intersections. +- `curved.h` - bilinear forms for real and complex vectors, and Mahalanobis distance. + +Bindings to other languages are in the respective directories: + +- `python/lib.c` - Python bindings. +- `javascript/lib.c` - JavaScript bindings. +- `rust/lib.rs` - Rust bindings. +- `swift/SimSIMD.swift` - Swift bindings. + +All tests, benchmarks, and examples are placed in the `scripts/` directory, provided they are compatible with the toolchain of the implementation language.
+ ## C and C++ To rerun experiments utilize the following command: @@ -277,4 +297,3 @@ cd golang go test # To test go test -run=^$ -bench=. -benchmem # To benchmark ``` -