diff --git a/include/simsimd/binary.h b/include/simsimd/binary.h index 8b625b08..bcda6262 100644 --- a/include/simsimd/binary.h +++ b/include/simsimd/binary.h @@ -80,8 +80,11 @@ SIMSIMD_PUBLIC void simsimd_jaccard_b8_serial(simsimd_b8_t const* a, simsimd_b8_ #if SIMSIMD_TARGET_NEON #pragma GCC push_options #pragma GCC target("arch=armv8.2-a+simd") +#ifdef __clang__ #pragma clang attribute push(__attribute__((target("arch=armv8.2-a+simd"))), apply_to = function) +#endif + SIMSIMD_PUBLIC void simsimd_hamming_b8_neon(simsimd_b8_t const* a, simsimd_b8_t const* b, simsimd_size_t n_words, simsimd_distance_t* result) { simsimd_i32_t differences = 0; @@ -113,15 +116,20 @@ SIMSIMD_PUBLIC void simsimd_jaccard_b8_neon(simsimd_b8_t const* a, simsimd_b8_t *result = (union_ != 0) ? 1 - (simsimd_f64_t)intersection / (simsimd_f64_t)union_ : 1; } +#ifdef __clang__ #pragma clang attribute pop +#endif #pragma GCC pop_options #endif // SIMSIMD_TARGET_NEON #if SIMSIMD_TARGET_SVE #pragma GCC push_options #pragma GCC target("arch=armv8.2-a+sve") +#ifdef __clang__ #pragma clang attribute push(__attribute__((target("arch=armv8.2-a+sve"))), apply_to = function) +#endif + SIMSIMD_PUBLIC void simsimd_hamming_b8_sve(simsimd_b8_t const* a, simsimd_b8_t const* b, simsimd_size_t n_words, simsimd_distance_t* result) { simsimd_size_t i = 0; @@ -151,7 +159,9 @@ SIMSIMD_PUBLIC void simsimd_jaccard_b8_sve(simsimd_b8_t const* a, simsimd_b8_t c *result = (union_ != 0) ? 
1 - (simsimd_f64_t)intersection / (simsimd_f64_t)union_ : 1; } +#ifdef __clang__ #pragma clang attribute pop +#endif #pragma GCC pop_options #endif // SIMSIMD_TARGET_SVE #endif // SIMSIMD_TARGET_ARM @@ -160,9 +170,12 @@ SIMSIMD_PUBLIC void simsimd_jaccard_b8_sve(simsimd_b8_t const* a, simsimd_b8_t c #if SIMSIMD_TARGET_ICE #pragma GCC push_options #pragma GCC target("avx2", "avx512f", "avx512vl", "bmi2", "avx512bw", "avx512vpopcntdq") +#ifdef __clang__ #pragma clang attribute push(__attribute__((target("avx2,avx512f,avx512vl,bmi2,avx512bw,avx512vpopcntdq"))), \ apply_to = function) +#endif + SIMSIMD_PUBLIC void simsimd_hamming_b8_ice(simsimd_b8_t const* a, simsimd_b8_t const* b, simsimd_size_t n_words, simsimd_distance_t* result) { __m512i differences_vec = _mm512_setzero_si512(); @@ -216,15 +229,20 @@ SIMSIMD_PUBLIC void simsimd_jaccard_b8_ice(simsimd_b8_t const* a, simsimd_b8_t c *result = (union_ != 0) ? 1 - (simsimd_f64_t)intersection / (simsimd_f64_t)union_ : 1; } +#ifdef __clang__ #pragma clang attribute pop +#endif #pragma GCC pop_options #endif // SIMSIMD_TARGET_ICE #if SIMSIMD_TARGET_HASWELL #pragma GCC push_options #pragma GCC target("popcnt") +#ifdef __clang__ #pragma clang attribute push(__attribute__((target("popcnt"))), apply_to = function) +#endif + SIMSIMD_PUBLIC void simsimd_hamming_b8_haswell(simsimd_b8_t const* a, simsimd_b8_t const* b, simsimd_size_t n_words, simsimd_distance_t* result) { // x86 supports unaligned loads and works just fine with the scalar version for small vectors. @@ -248,7 +266,9 @@ SIMSIMD_PUBLIC void simsimd_jaccard_b8_haswell(simsimd_b8_t const* a, simsimd_b8 *result = (union_ != 0) ? 
1 - (simsimd_f64_t)intersection / (simsimd_f64_t)union_ : 1; } +#ifdef __clang__ #pragma clang attribute pop +#endif #pragma GCC pop_options #endif // SIMSIMD_TARGET_HASWELL #endif // SIMSIMD_TARGET_X86 diff --git a/include/simsimd/curved.h b/include/simsimd/curved.h index 69b70559..66e244b2 100644 --- a/include/simsimd/curved.h +++ b/include/simsimd/curved.h @@ -153,8 +153,11 @@ SIMSIMD_MAKE_MAHALANOBIS(accurate, bf16, f64, SIMSIMD_BF16_TO_F32) // simsimd_ma #if SIMSIMD_TARGET_NEON #pragma GCC push_options #pragma GCC target("arch=armv8.2-a+simd") +#ifdef __clang__ #pragma clang attribute push(__attribute__((target("arch=armv8.2-a+simd"))), apply_to = function) +#endif + SIMSIMD_PUBLIC void simsimd_bilinear_f32_neon(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_f32_t const* c, simsimd_size_t n, simsimd_distance_t* result) { float32x4_t sum_vec = vdupq_n_f32(0); @@ -220,15 +223,20 @@ SIMSIMD_PUBLIC void simsimd_mahalanobis_f32_neon(simsimd_f32_t const* a, simsimd *result = _simsimd_sqrt_f64_neon(sum); } +#ifdef __clang__ #pragma clang attribute pop +#endif #pragma GCC pop_options #endif // SIMSIMD_TARGET_NEON #if SIMSIMD_TARGET_NEON_F16 #pragma GCC push_options #pragma GCC target("arch=armv8.2-a+simd+fp16") +#ifdef __clang__ #pragma clang attribute push(__attribute__((target("arch=armv8.2-a+simd+fp16"))), apply_to = function) +#endif + SIMSIMD_PUBLIC void simsimd_bilinear_f16_neon(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_f16_t const* c, simsimd_size_t n, simsimd_distance_t* result) { float32x4_t sum_vec = vdupq_n_f32(0); @@ -301,15 +309,20 @@ SIMSIMD_PUBLIC void simsimd_mahalanobis_f16_neon(simsimd_f16_t const* a, simsimd *result = _simsimd_sqrt_f32_neon(sum); } +#ifdef __clang__ #pragma clang attribute pop +#endif #pragma GCC pop_options #endif // SIMSIMD_TARGET_NEON_F16 #if SIMSIMD_TARGET_NEON_BF16 #pragma GCC push_options #pragma GCC target("arch=armv8.6-a+simd+bf16") +#ifdef __clang__ #pragma clang attribute 
push(__attribute__((target("arch=armv8.6-a+simd+bf16"))), apply_to = function) +#endif + SIMSIMD_PUBLIC void simsimd_bilinear_bf16_neon(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_bf16_t const* c, simsimd_size_t n, simsimd_distance_t* result) { float32x4_t sum_vec = vdupq_n_f32(0); @@ -401,7 +414,9 @@ SIMSIMD_PUBLIC void simsimd_mahalanobis_bf16_neon(simsimd_bf16_t const* a, simsi *result = _simsimd_sqrt_f32_neon(sum); } +#ifdef __clang__ #pragma clang attribute pop +#endif #pragma GCC pop_options #endif // SIMSIMD_TARGET_NEON_BF16 @@ -411,8 +426,11 @@ SIMSIMD_PUBLIC void simsimd_mahalanobis_bf16_neon(simsimd_bf16_t const* a, simsi #if SIMSIMD_TARGET_HASWELL #pragma GCC push_options #pragma GCC target("avx2", "f16c", "fma") +#ifdef __clang__ #pragma clang attribute push(__attribute__((target("avx2,f16c,fma"))), apply_to = function) +#endif + SIMSIMD_PUBLIC void simsimd_bilinear_f16_haswell(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_f16_t const* c, simsimd_size_t n, simsimd_distance_t* result) { __m256 sum_vec = _mm256_setzero_ps(); @@ -558,15 +576,20 @@ SIMSIMD_PUBLIC void simsimd_mahalanobis_bf16_haswell(simsimd_bf16_t const* a, si *result = _simsimd_sqrt_f32_haswell(sum); } +#ifdef __clang__ #pragma clang attribute pop +#endif #pragma GCC pop_options #endif // SIMSIMD_TARGET_HASWELL #if SIMSIMD_TARGET_SKYLAKE #pragma GCC push_options #pragma GCC target("avx2", "avx512f", "avx512vl", "bmi2") +#ifdef __clang__ #pragma clang attribute push(__attribute__((target("avx2,avx512f,avx512vl,bmi2"))), apply_to = function) +#endif + SIMSIMD_PUBLIC void simsimd_bilinear_f32_skylake(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_f32_t const* c, simsimd_size_t n, simsimd_distance_t* result) { simsimd_size_t tail_length = n % 16; @@ -634,16 +657,21 @@ SIMSIMD_PUBLIC void simsimd_mahalanobis_f32_skylake(simsimd_f32_t const* a, sims *result = _simsimd_sqrt_f64_haswell(_mm512_reduce_add_ps(sum_vec)); } +#ifdef __clang__ #pragma clang attribute 
pop +#endif #pragma GCC pop_options #endif // SIMSIMD_TARGET_SKYLAKE #if SIMSIMD_TARGET_GENOA #pragma GCC push_options #pragma GCC target("avx2", "avx512f", "avx512vl", "bmi2", "avx512bw", "avx512bf16") +#ifdef __clang__ #pragma clang attribute push(__attribute__((target("avx2,avx512f,avx512vl,bmi2,avx512bw,avx512bf16"))), \ apply_to = function) +#endif + SIMSIMD_PUBLIC void simsimd_bilinear_bf16_genoa(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_bf16_t const* c, simsimd_size_t n, simsimd_distance_t* result) { simsimd_size_t tail_length = n % 32; @@ -711,16 +739,21 @@ SIMSIMD_PUBLIC void simsimd_mahalanobis_bf16_genoa(simsimd_bf16_t const* a, sims *result = _simsimd_sqrt_f32_haswell(_mm512_reduce_add_ps(sum_vec)); } +#ifdef __clang__ #pragma clang attribute pop +#endif #pragma GCC pop_options #endif // SIMSIMD_TARGET_GENOA #if SIMSIMD_TARGET_SAPPHIRE #pragma GCC push_options #pragma GCC target("avx2", "avx512f", "avx512vl", "bmi2", "avx512bw", "avx512fp16") +#ifdef __clang__ #pragma clang attribute push(__attribute__((target("avx2,avx512f,avx512vl,bmi2,avx512bw,avx512fp16"))), \ apply_to = function) +#endif + SIMSIMD_PUBLIC void simsimd_bilinear_f16_sapphire(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_f16_t const* c, simsimd_size_t n, simsimd_distance_t* result) { @@ -792,7 +825,9 @@ SIMSIMD_PUBLIC void simsimd_mahalanobis_f16_sapphire(simsimd_f16_t const* a, sim *result = _simsimd_sqrt_f32_haswell(_mm512_reduce_add_ph(sum_vec)); } +#ifdef __clang__ #pragma clang attribute pop +#endif #pragma GCC pop_options #endif // SIMSIMD_TARGET_SAPPHIRE #endif // SIMSIMD_TARGET_X86 diff --git a/include/simsimd/dot.h b/include/simsimd/dot.h index 3cb7959f..61b4aff7 100644 --- a/include/simsimd/dot.h +++ b/include/simsimd/dot.h @@ -227,8 +227,11 @@ SIMSIMD_MAKE_COMPLEX_VDOT(accurate, bf16, f64, SIMSIMD_BF16_TO_F32) // simsimd_v #if SIMSIMD_TARGET_NEON #pragma GCC push_options #pragma GCC target("arch=armv8.2-a+simd") +#ifdef __clang__ #pragma clang attribute 
push(__attribute__((target("arch=armv8.2-a+simd"))), apply_to = function) +#endif + SIMSIMD_INTERNAL float32x4_t _simsimd_partial_load_f32x4_neon(simsimd_f32_t const* a, simsimd_size_t n) { union { float32x4_t vec; @@ -327,13 +330,18 @@ SIMSIMD_PUBLIC void simsimd_vdot_f32c_neon(simsimd_f32_t const* a, simsimd_f32_t results[1] = ab_imag; } +#ifdef __clang__ #pragma clang attribute pop +#endif #pragma GCC pop_options #pragma GCC push_options #pragma GCC target("arch=armv8.2-a+dotprod") +#ifdef __clang__ #pragma clang attribute push(__attribute__((target("arch=armv8.2-a+dotprod"))), apply_to = function) +#endif + SIMSIMD_PUBLIC void simsimd_dot_i8_neon(simsimd_i8_t const* a, simsimd_i8_t const* b, simsimd_size_t n, simsimd_distance_t* result) { @@ -364,15 +372,20 @@ SIMSIMD_PUBLIC void simsimd_dot_i8_neon(simsimd_i8_t const* a, simsimd_i8_t cons *result = ab; } +#ifdef __clang__ #pragma clang attribute pop +#endif #pragma GCC pop_options #endif #if SIMSIMD_TARGET_NEON_F16 #pragma GCC push_options #pragma GCC target("arch=armv8.2-a+simd+fp16") +#ifdef __clang__ #pragma clang attribute push(__attribute__((target("arch=armv8.2-a+simd+fp16"))), apply_to = function) +#endif + SIMSIMD_INTERNAL float16x4_t _simsimd_partial_load_f16x4_neon(simsimd_f16_t const* a, simsimd_size_t n) { // In case the software emulation for `f16` scalars is enabled, the `simsimd_f16_to_f32` // function will run. 
It is extremely slow, so even for the tail, let's combine serial @@ -481,15 +494,20 @@ SIMSIMD_PUBLIC void simsimd_vdot_f16c_neon(simsimd_f16_t const* a, simsimd_f16_t results[1] += vaddvq_f32(ab_imag_vec); } +#ifdef __clang__ #pragma clang attribute pop +#endif #pragma GCC pop_options #endif // SIMSIMD_TARGET_NEON_F16 #if SIMSIMD_TARGET_NEON_BF16 #pragma GCC push_options #pragma GCC target("arch=armv8.6-a+simd+bf16") +#ifdef __clang__ #pragma clang attribute push(__attribute__((target("arch=armv8.6-a+simd+bf16"))), apply_to = function) +#endif + SIMSIMD_INTERNAL bfloat16x8_t _simsimd_partial_load_bf16x8_neon(simsimd_bf16_t const* a, simsimd_size_t n) { union { bfloat16x8_t vec; @@ -595,7 +613,9 @@ SIMSIMD_PUBLIC void simsimd_vdot_bf16c_neon(simsimd_bf16_t const* a, simsimd_bf1 results[1] += vaddvq_f32(ab_imag_vec); } +#ifdef __clang__ #pragma clang attribute pop +#endif #pragma GCC pop_options #endif // SIMSIMD_TARGET_NEON_BF16 @@ -603,8 +623,11 @@ SIMSIMD_PUBLIC void simsimd_vdot_bf16c_neon(simsimd_bf16_t const* a, simsimd_bf1 #pragma GCC push_options #pragma GCC target("arch=armv8.2-a+sve") +#ifdef __clang__ #pragma clang attribute push(__attribute__((target("arch=armv8.2-a+sve"))), apply_to = function) +#endif + SIMSIMD_PUBLIC void simsimd_dot_f32_sve(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t n, simsimd_distance_t* result) { simsimd_size_t i = 0; @@ -725,13 +748,18 @@ SIMSIMD_PUBLIC void simsimd_vdot_f64c_sve(simsimd_f64_t const* a, simsimd_f64_t results[1] = svaddv_f64(svptrue_b64(), ab_imag_vec); } +#ifdef __clang__ #pragma clang attribute pop +#endif #pragma GCC pop_options #pragma GCC push_options #pragma GCC target("arch=armv8.2-a+sve+fp16") +#ifdef __clang__ #pragma clang attribute push(__attribute__((target("arch=armv8.2-a+sve+fp16"))), apply_to = function) +#endif + SIMSIMD_PUBLIC void simsimd_dot_f16_sve(simsimd_f16_t const* a_enum, simsimd_f16_t const* b_enum, simsimd_size_t n, simsimd_distance_t* result) { simsimd_size_t i = 0; 
@@ -795,7 +823,9 @@ SIMSIMD_PUBLIC void simsimd_vdot_f16c_sve(simsimd_f16_t const* a, simsimd_f16_t results[1] = svaddv_f16(svptrue_b16(), ab_imag_vec); } +#ifdef __clang__ #pragma clang attribute pop +#endif #pragma GCC pop_options #endif // SIMSIMD_TARGET_SVE #endif // SIMSIMD_TARGET_ARM @@ -804,8 +834,11 @@ SIMSIMD_PUBLIC void simsimd_vdot_f16c_sve(simsimd_f16_t const* a, simsimd_f16_t #if SIMSIMD_TARGET_HASWELL #pragma GCC push_options #pragma GCC target("avx2", "f16c", "fma") +#ifdef __clang__ #pragma clang attribute push(__attribute__((target("avx2,f16c,fma"))), apply_to = function) +#endif + SIMSIMD_INTERNAL simsimd_f64_t _simsimd_reduce_f32x8_haswell(__m256 vec) { // Convert the lower and higher 128-bit lanes of the input vector to double precision __m128 low_f32 = _mm256_castps256_ps128(vec); @@ -1153,15 +1186,20 @@ SIMSIMD_PUBLIC void simsimd_dot_bf16_haswell(simsimd_bf16_t const* a, simsimd_bf *result = _simsimd_reduce_f32x8_haswell(ab_vec); } +#ifdef __clang__ #pragma clang attribute pop +#endif #pragma GCC pop_options #endif // SIMSIMD_TARGET_HASWELL #if SIMSIMD_TARGET_SKYLAKE #pragma GCC push_options #pragma GCC target("avx2", "avx512f", "avx512vl", "avx512bw", "bmi2") +#ifdef __clang__ #pragma clang attribute push(__attribute__((target("avx2,avx512f,avx512vl,avx512bw,bmi2"))), apply_to = function) +#endif + SIMSIMD_INTERNAL simsimd_f64_t _simsimd_reduce_f32x16_skylake(__m512 a) { __m512 x = _mm512_add_ps(a, _mm512_shuffle_f32x4(a, a, _MM_SHUFFLE(0, 0, 3, 2))); __m128 r = _mm512_castps512_ps128(_mm512_add_ps(x, _mm512_shuffle_f32x4(x, x, _MM_SHUFFLE(0, 0, 0, 1)))); @@ -1397,16 +1435,21 @@ SIMSIMD_PUBLIC void simsimd_vdot_f64c_skylake(simsimd_f64_t const* a, simsimd_f6 results[1] = _mm512_reduce_add_pd(ab_imag_vec); } +#ifdef __clang__ #pragma clang attribute pop +#endif #pragma GCC pop_options #endif // SIMSIMD_TARGET_SKYLAKE #if SIMSIMD_TARGET_GENOA #pragma GCC push_options #pragma GCC target("avx2", "avx512f", "avx512vl", "bmi2", "avx512bw", 
"avx512bf16") +#ifdef __clang__ #pragma clang attribute push(__attribute__((target("avx2,avx512f,avx512vl,bmi2,avx512bw,avx512bf16"))), \ apply_to = function) +#endif + SIMSIMD_PUBLIC void simsimd_dot_bf16_genoa(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_size_t n, simsimd_distance_t* result) { __m512 ab_vec = _mm512_setzero_ps(); @@ -1515,16 +1558,21 @@ SIMSIMD_PUBLIC void simsimd_vdot_bf16c_genoa(simsimd_bf16_t const* a, simsimd_bf results[1] = _simsimd_reduce_f32x16_skylake(ab_imag_vec); } +#ifdef __clang__ #pragma clang attribute pop +#endif #pragma GCC pop_options #endif // SIMSIMD_TARGET_GENOA #if SIMSIMD_TARGET_SAPPHIRE #pragma GCC push_options #pragma GCC target("avx2", "avx512f", "avx512vl", "bmi2", "avx512bw", "avx512fp16") +#ifdef __clang__ #pragma clang attribute push(__attribute__((target("avx2,avx512f,avx512vl,bmi2,avx512bw,avx512fp16"))), \ apply_to = function) +#endif + SIMSIMD_PUBLIC void simsimd_dot_f16_sapphire(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n, simsimd_distance_t* result) { __m512h ab_vec = _mm512_setzero_ph(); @@ -1636,16 +1684,21 @@ SIMSIMD_PUBLIC void simsimd_vdot_f16c_sapphire(simsimd_f16_t const* a, simsimd_f results[1] = _mm512_reduce_add_ph(ab_imag_vec); } +#ifdef __clang__ #pragma clang attribute pop +#endif #pragma GCC pop_options #endif // SIMSIMD_TARGET_SAPPHIRE #if SIMSIMD_TARGET_ICE #pragma GCC push_options #pragma GCC target("avx2", "avx512f", "avx512vl", "bmi2", "avx512bw", "avx512vnni") +#ifdef __clang__ #pragma clang attribute push(__attribute__((target("avx2,avx512f,avx512vl,bmi2,avx512bw,avx512vnni"))), \ apply_to = function) +#endif + SIMSIMD_PUBLIC void simsimd_dot_i8_ice(simsimd_i8_t const* a, simsimd_i8_t const* b, simsimd_size_t n, simsimd_distance_t* result) { __m512i ab_i32s_vec = _mm512_setzero_si512(); @@ -1674,7 +1727,9 @@ SIMSIMD_PUBLIC void simsimd_dot_i8_ice(simsimd_i8_t const* a, simsimd_i8_t const *result = _mm512_reduce_add_epi32(ab_i32s_vec); } +#ifdef __clang__ 
#pragma clang attribute pop +#endif #pragma GCC pop_options #endif // SIMSIMD_TARGET_ICE #endif // SIMSIMD_TARGET_X86 diff --git a/include/simsimd/probability.h b/include/simsimd/probability.h index 901f020a..d2d4612d 100644 --- a/include/simsimd/probability.h +++ b/include/simsimd/probability.h @@ -136,8 +136,11 @@ SIMSIMD_MAKE_JS(accurate, bf16, f64, SIMSIMD_BF16_TO_F32, SIMSIMD_F32_DIVISION_E #if SIMSIMD_TARGET_NEON #pragma GCC push_options #pragma GCC target("arch=armv8.2-a+simd") +#ifdef __clang__ #pragma clang attribute push(__attribute__((target("arch=armv8.2-a+simd"))), apply_to = function) +#endif + SIMSIMD_PUBLIC float32x4_t _simsimd_log2_f32_neon(float32x4_t x) { // Extracting the exponent int32x4_t i = vreinterpretq_s32_f32(x); @@ -227,15 +230,20 @@ SIMSIMD_PUBLIC void simsimd_js_f32_neon(simsimd_f32_t const* a, simsimd_f32_t co *result = sum / 2; } +#ifdef __clang__ #pragma clang attribute pop +#endif #pragma GCC pop_options #endif // SIMSIMD_TARGET_NEON #if SIMSIMD_TARGET_NEON_F16 #pragma GCC push_options #pragma GCC target("arch=armv8.2-a+simd+fp16") +#ifdef __clang__ #pragma clang attribute push(__attribute__((target("arch=armv8.2-a+simd+fp16"))), apply_to = function) +#endif + SIMSIMD_PUBLIC void simsimd_kl_f16_neon(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n, simsimd_distance_t* result) { float32x4_t sum_vec = vdupq_n_f32(0); @@ -300,7 +308,9 @@ SIMSIMD_PUBLIC void simsimd_js_f16_neon(simsimd_f16_t const* a, simsimd_f16_t co *result = sum / 2; } +#ifdef __clang__ #pragma clang attribute pop +#endif #pragma GCC pop_options #endif // SIMSIMD_TARGET_NEON_F16 #endif // SIMSIMD_TARGET_ARM @@ -309,8 +319,11 @@ SIMSIMD_PUBLIC void simsimd_js_f16_neon(simsimd_f16_t const* a, simsimd_f16_t co #if SIMSIMD_TARGET_HASWELL #pragma GCC push_options #pragma GCC target("avx2", "f16c", "fma") +#ifdef __clang__ #pragma clang attribute push(__attribute__((target("avx2,f16c,fma"))), apply_to = function) +#endif + SIMSIMD_INTERNAL __m256 
_simsimd_log2_f32_haswell(__m256 x) { // Extracting the exponent __m256i i = _mm256_castps_si256(x); @@ -405,15 +418,20 @@ SIMSIMD_PUBLIC void simsimd_js_f16_haswell(simsimd_f16_t const* a, simsimd_f16_t *result = sum / 2; } +#ifdef __clang__ #pragma clang attribute pop +#endif #pragma GCC pop_options #endif // SIMSIMD_TARGET_HASWELL #if SIMSIMD_TARGET_SKYLAKE #pragma GCC push_options #pragma GCC target("avx2", "avx512f", "avx512vl", "bmi2") +#ifdef __clang__ #pragma clang attribute push(__attribute__((target("avx2,avx512f,avx512vl,bmi2"))), apply_to = function) +#endif + SIMSIMD_INTERNAL __m512 _simsimd_log2_f32_skylake(__m512 x) { // Extract the exponent and mantissa __m512 one = _mm512_set1_ps(1.0f); @@ -497,15 +515,20 @@ SIMSIMD_PUBLIC void simsimd_js_f32_skylake(simsimd_f32_t const* a, simsimd_f32_t *result = _mm512_reduce_add_ps(_mm512_add_ps(sum_a_vec, sum_b_vec)) * log2_normalizer / 2; } +#ifdef __clang__ #pragma clang attribute pop +#endif #pragma GCC pop_options #endif // SIMSIMD_TARGET_HASWELL #if SIMSIMD_TARGET_SAPPHIRE #pragma GCC push_options #pragma GCC target("avx2", "avx512f", "avx512vl", "bmi2", "avx512fp16") +#ifdef __clang__ #pragma clang attribute push(__attribute__((target("avx2,avx512f,avx512vl,bmi2,avx512fp16"))), apply_to = function) +#endif + SIMSIMD_INTERNAL __m512h _simsimd_log2_f16_sapphire(__m512h x) { // Extract the exponent and mantissa __m512h one = _mm512_set1_ph((simsimd_f16_t)1); @@ -587,7 +610,9 @@ SIMSIMD_PUBLIC void simsimd_js_f16_sapphire(simsimd_f16_t const* a, simsimd_f16_ *result = _mm512_reduce_add_ph(_mm512_add_ph(sum_a_vec, sum_b_vec)) * log2_normalizer / 2; } +#ifdef __clang__ #pragma clang attribute pop +#endif #pragma GCC pop_options #endif // SIMSIMD_TARGET_SAPPHIRE #endif // SIMSIMD_TARGET_X86 diff --git a/include/simsimd/simsimd.h b/include/simsimd/simsimd.h index 2ad0a869..2a2d5d79 100644 --- a/include/simsimd/simsimd.h +++ b/include/simsimd/simsimd.h @@ -355,8 +355,11 @@ SIMSIMD_PUBLIC simsimd_capability_t 
simsimd_capabilities_x86(void) { */ #pragma GCC push_options #pragma GCC target("arch=armv8.5-a+sve") +#ifdef __clang__ #pragma clang attribute push(__attribute__((target("arch=armv8.5-a+sve"))), apply_to = function) +#endif + /** * @brief Function to determine the SIMD capabilities of the current 64-bit Arm machine at @b runtime. * @return A bitmask of the SIMD capabilities represented as a `simsimd_capability_t` enum value. @@ -460,7 +463,9 @@ SIMSIMD_PUBLIC simsimd_capability_t simsimd_capabilities_arm(void) { #endif } +#ifdef __clang__ #pragma clang attribute pop +#endif #pragma GCC pop_options #endif @@ -482,9 +487,11 @@ SIMSIMD_PUBLIC simsimd_capability_t simsimd_capabilities_implementation(void) { #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wcast-function-type" #pragma GCC diagnostic ignored "-Wvolatile" +#ifdef __clang__ #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wcast-function-type" #pragma clang diagnostic ignored "-Wvolatile" +#endif /** * @brief Determines the best suited metric implementation based on the given datatype, @@ -1091,8 +1098,11 @@ SIMSIMD_PUBLIC void simsimd_find_metric_punned( // } #pragma GCC diagnostic pop +#ifdef __clang__ #pragma clang diagnostic pop +#endif + /** * @brief Selects the most suitable metric implementation based on the given metric kind, datatype, * and allowed capabilities. @b Don't call too often and prefer caching the `simsimd_capabilities()`. 
diff --git a/include/simsimd/sparse.h b/include/simsimd/sparse.h index 0fc2fbb6..8d8b3a69 100644 --- a/include/simsimd/sparse.h +++ b/include/simsimd/sparse.h @@ -152,9 +152,12 @@ SIMSIMD_MAKE_INTERSECT_GALLOPING(serial, u32, size) // simsimd_intersect_u32_ser #if SIMSIMD_TARGET_ICE #pragma GCC push_options #pragma GCC target("avx2", "avx512f", "avx512vl", "bmi2", "lzcnt", "popcnt", "avx512bw", "avx512vbmi2") +#ifdef __clang__ #pragma clang attribute push(__attribute__((target("avx2,avx512f,avx512vl,bmi2,lzcnt,popcnt,avx512bw,avx512vbmi2"))), \ apply_to = function) +#endif + /* The AVX-512 implementations are inspired by the "Faster-Than-Native Alternatives * for x86 VP2INTERSECT Instructions" paper by Guille Diez-Canas, 2022. * @@ -384,7 +387,9 @@ SIMSIMD_PUBLIC void simsimd_intersect_u32_ice(simsimd_u32_t const* a, simsimd_u3 *results += c; } +#ifdef __clang__ #pragma clang attribute pop +#endif #pragma GCC pop_options #endif // SIMSIMD_TARGET_ICE #endif // SIMSIMD_TARGET_X86 @@ -393,8 +398,11 @@ SIMSIMD_PUBLIC void simsimd_intersect_u32_ice(simsimd_u32_t const* a, simsimd_u3 #if SIMSIMD_TARGET_NEON #pragma GCC push_options #pragma GCC target("arch=armv8.2-a") +#ifdef __clang__ #pragma clang attribute push(__attribute__((target("arch=armv8.2-a"))), apply_to = function) +#endif + /** * @brief Uses `vshrn` to produce a bitmask, similar to `movemask` in SSE. 
* https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon @@ -599,15 +607,20 @@ SIMSIMD_PUBLIC void simsimd_intersect_u32_neon(simsimd_u32_t const* a, simsimd_u *results += vaddvq_u32(c_counts_vec.u32x4); } +#ifdef __clang__ #pragma clang attribute pop +#endif #pragma GCC pop_options #endif // SIMSIMD_TARGET_NEON #if SIMSIMD_TARGET_SVE2 #pragma GCC push_options #pragma GCC target("arch=armv8.2-a+sve+sve2") +#ifdef __clang__ #pragma clang attribute push(__attribute__((target("arch=armv8.2-a+sve+sve2"))), apply_to = function) +#endif + /* SVE2 introduces many new integer-oriented instructions, extending some of the NEON functionality to * variable-length SVE registers. Those include "compare multiple" intrinsics: * @@ -795,7 +808,9 @@ SIMSIMD_PUBLIC void simsimd_intersect_u32_sve2(simsimd_u32_t const* a, simsimd_u *results = c; } +#ifdef __clang__ #pragma clang attribute pop +#endif #pragma GCC pop_options #endif // SIMSIMD_TARGET_SVE2 #endif // SIMSIMD_TARGET_ARM diff --git a/include/simsimd/spatial.h b/include/simsimd/spatial.h index fd5b5318..4a823a27 100644 --- a/include/simsimd/spatial.h +++ b/include/simsimd/spatial.h @@ -192,7 +192,9 @@ SIMSIMD_MAKE_COS(accurate, i8, i32, SIMSIMD_DEREFERENCE) // simsimd_cos_i8_accu #if SIMSIMD_TARGET_NEON #pragma GCC push_options #pragma GCC target("arch=armv8.2-a+simd") +#ifdef __clang__ #pragma clang attribute push(__attribute__((target("arch=armv8.2-a+simd"))), apply_to = function) +#endif SIMSIMD_INTERNAL simsimd_f32_t _simsimd_sqrt_f32_neon(simsimd_f32_t x) { return vget_lane_f32(vsqrt_f32(vdup_n_f32(x)), 0); @@ -323,15 +325,20 @@ SIMSIMD_PUBLIC void simsimd_cos_f64_neon(simsimd_f64_t const* a, simsimd_f64_t c *result = _simsimd_cos_normalize_f64_neon(ab, a2, b2); } +#ifdef __clang__ #pragma clang attribute pop +#endif #pragma GCC pop_options #endif // SIMSIMD_TARGET_NEON #if SIMSIMD_TARGET_NEON_F16 #pragma GCC push_options #pragma GCC 
target("arch=armv8.2-a+simd+fp16") +#ifdef __clang__ #pragma clang attribute push(__attribute__((target("arch=armv8.2-a+simd+fp16"))), apply_to = function) +#endif + SIMSIMD_PUBLIC void simsimd_l2sq_f16_neon(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n, simsimd_distance_t* result) { float32x4_t a_vec, b_vec; @@ -380,15 +387,20 @@ SIMSIMD_PUBLIC void simsimd_cos_f16_neon(simsimd_f16_t const* a, simsimd_f16_t c *result = _simsimd_cos_normalize_f32_neon(ab, a2, b2); } +#ifdef __clang__ #pragma clang attribute pop +#endif #pragma GCC pop_options #endif // SIMSIMD_TARGET_NEON_F16 #if SIMSIMD_TARGET_NEON_BF16 #pragma GCC push_options #pragma GCC target("arch=armv8.6-a+simd+bf16") +#ifdef __clang__ #pragma clang attribute push(__attribute__((target("arch=armv8.6-a+simd+bf16"))), apply_to = function) +#endif + SIMSIMD_PUBLIC void simsimd_cos_bf16_neon(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_size_t n, simsimd_distance_t* result) { @@ -484,15 +496,20 @@ SIMSIMD_PUBLIC void simsimd_l2sq_bf16_neon(simsimd_bf16_t const* a, simsimd_bf16 *result = vaddvq_f32(vaddq_f32(sum_high_vec, sum_low_vec)); } +#ifdef __clang__ #pragma clang attribute pop +#endif #pragma GCC pop_options #endif // SIMSIMD_TARGET_NEON_BF16 #if SIMSIMD_TARGET_NEON_I8 #pragma GCC push_options #pragma GCC target("arch=armv8.2-a+dotprod+i8mm") +#ifdef __clang__ #pragma clang attribute push(__attribute__((target("arch=armv8.2-a+dotprod+i8mm"))), apply_to = function) +#endif + SIMSIMD_PUBLIC void simsimd_l2sq_i8_neon(simsimd_i8_t const* a, simsimd_i8_t const* b, simsimd_size_t n, simsimd_distance_t* result) { int32x4_t d2_vec = vdupq_n_s32(0); @@ -634,15 +651,20 @@ SIMSIMD_PUBLIC void simsimd_cos_i8_neon(simsimd_i8_t const* a, simsimd_i8_t cons *result = _simsimd_cos_normalize_f32_neon(ab, a2, b2); } +#ifdef __clang__ #pragma clang attribute pop +#endif #pragma GCC pop_options #endif // SIMSIMD_TARGET_NEON_I8 #if SIMSIMD_TARGET_SVE #pragma GCC push_options #pragma GCC 
target("arch=armv8.2-a+sve") +#ifdef __clang__ #pragma clang attribute push(__attribute__((target("arch=armv8.2-a+sve"))), apply_to = function) +#endif + SIMSIMD_PUBLIC void simsimd_l2sq_f32_sve(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t n, simsimd_distance_t* result) { simsimd_size_t i = 0; @@ -719,15 +741,20 @@ SIMSIMD_PUBLIC void simsimd_cos_f64_sve(simsimd_f64_t const* a, simsimd_f64_t co *result = _simsimd_cos_normalize_f64_neon(ab, a2, b2); } +#ifdef __clang__ #pragma clang attribute pop +#endif #pragma GCC pop_options #endif // SIMSIMD_TARGET_SVE #if SIMSIMD_TARGET_SVE_F16 #pragma GCC push_options #pragma GCC target("arch=armv8.2-a+sve+fp16") +#ifdef __clang__ #pragma clang attribute push(__attribute__((target("arch=armv8.2-a+sve+fp16"))), apply_to = function) +#endif + SIMSIMD_PUBLIC void simsimd_l2sq_f16_sve(simsimd_f16_t const* a_enum, simsimd_f16_t const* b_enum, simsimd_size_t n, simsimd_distance_t* result) { simsimd_size_t i = 0; @@ -770,15 +797,20 @@ SIMSIMD_PUBLIC void simsimd_cos_f16_sve(simsimd_f16_t const* a_enum, simsimd_f16 *result = _simsimd_cos_normalize_f32_neon(ab, a2, b2); } +#ifdef __clang__ #pragma clang attribute pop +#endif #pragma GCC pop_options #endif // SIMSIMD_TARGET_SVE_F16 #if SIMSIMD_TARGET_SVE_BF16 #pragma GCC push_options #pragma GCC target("arch=armv8.2-a+sve+bf16") +#ifdef __clang__ #pragma clang attribute push(__attribute__((target("arch=armv8.2-a+sve+bf16"))), apply_to = function) +#endif + SIMSIMD_PUBLIC void simsimd_l2sq_bf16_sve(simsimd_bf16_t const* a_enum, simsimd_bf16_t const* b_enum, simsimd_size_t n, simsimd_distance_t* result) { simsimd_size_t i = 0; @@ -833,7 +865,9 @@ SIMSIMD_PUBLIC void simsimd_cos_bf16_sve(simsimd_bf16_t const* a_enum, simsimd_b *result = _simsimd_cos_normalize_f32_neon(ab, a2, b2); } +#ifdef __clang__ #pragma clang attribute pop +#endif #pragma GCC pop_options #endif // SIMSIMD_TARGET_SVE_BF16 #endif // SIMSIMD_TARGET_ARM @@ -842,7 +876,9 @@ SIMSIMD_PUBLIC void 
simsimd_cos_bf16_sve(simsimd_bf16_t const* a_enum, simsimd_b #if SIMSIMD_TARGET_HASWELL #pragma GCC push_options #pragma GCC target("avx2") +#ifdef __clang__ #pragma clang attribute push(__attribute__((target("avx2"))), apply_to = function) +#endif SIMSIMD_INTERNAL simsimd_f32_t _simsimd_sqrt_f32_haswell(simsimd_f32_t x) { return _mm_cvtss_f32(_mm_sqrt_ps(_mm_set_ss(x))); @@ -850,7 +886,6 @@ SIMSIMD_INTERNAL simsimd_f32_t _simsimd_sqrt_f32_haswell(simsimd_f32_t x) { SIMSIMD_INTERNAL simsimd_f64_t _simsimd_sqrt_f64_haswell(simsimd_f64_t x) { return _mm_cvtsd_f64(_mm_sqrt_pd(_mm_set_sd(x))); } - SIMSIMD_INTERNAL simsimd_distance_t _simsimd_cos_normalize_f64_haswell(simsimd_f64_t ab, simsimd_f64_t a2, simsimd_f64_t b2) { @@ -913,7 +948,9 @@ SIMSIMD_INTERNAL simsimd_distance_t _simsimd_cos_normalize_f32_haswell(simsimd_f return result > 0 ? result : 0; } +#ifdef __clang__ #pragma clang attribute pop +#endif #pragma GCC pop_options #endif // SIMSIMD_TARGET_HASWELL #endif // SIMSIMD_TARGET_X86 @@ -922,8 +959,11 @@ SIMSIMD_INTERNAL simsimd_distance_t _simsimd_cos_normalize_f32_haswell(simsimd_f #if SIMSIMD_TARGET_HASWELL #pragma GCC push_options #pragma GCC target("avx2", "f16c", "fma") +#ifdef __clang__ #pragma clang attribute push(__attribute__((target("avx2,f16c,fma"))), apply_to = function) +#endif + SIMSIMD_PUBLIC void simsimd_l2sq_f16_haswell(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n, simsimd_distance_t* result) { __m256 a_vec, b_vec; @@ -1174,15 +1214,20 @@ SIMSIMD_PUBLIC void simsimd_cos_f32_haswell(simsimd_f32_t const* a, simsimd_f32_ *result = _simsimd_cos_normalize_f64_haswell(ab, a2, b2); } +#ifdef __clang__ #pragma clang attribute pop +#endif #pragma GCC pop_options #endif // SIMSIMD_TARGET_HASWELL #if SIMSIMD_TARGET_SKYLAKE #pragma GCC push_options #pragma GCC target("avx2", "avx512f", "avx512bw", "avx512vl", "bmi2") +#ifdef __clang__ #pragma clang attribute push(__attribute__((target("avx2,avx512f,avx512bw,avx512vl,bmi2"))), apply_to 
= function) +#endif + SIMSIMD_PUBLIC void simsimd_l2sq_f32_skylake(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t n, simsimd_distance_t* result) { __m512 d2_vec = _mm512_setzero(); @@ -1335,16 +1380,21 @@ SIMSIMD_PUBLIC void simsimd_cos_f64_skylake(simsimd_f64_t const* a, simsimd_f64_ *result = _simsimd_cos_normalize_f64_skylake(ab, a2, b2); } +#ifdef __clang__ #pragma clang attribute pop +#endif #pragma GCC pop_options #endif // SIMSIMD_TARGET_SKYLAKE #if SIMSIMD_TARGET_GENOA #pragma GCC push_options #pragma GCC target("avx2", "avx512f", "avx512vl", "bmi2", "avx512bw", "avx512bf16") +#ifdef __clang__ #pragma clang attribute push(__attribute__((target("avx2,avx512f,avx512vl,bmi2,avx512bw,avx512bf16"))), \ apply_to = function) +#endif + SIMSIMD_INTERNAL __m512i _simsimd_substract_bf16x32_genoa(__m512i a_i16, __m512i b_i16) { union { @@ -1451,15 +1501,20 @@ SIMSIMD_PUBLIC void simsimd_cos_bf16_genoa(simsimd_bf16_t const* a, simsimd_bf16 *result = _simsimd_cos_normalize_f32_haswell(ab, a2, b2); } +#ifdef __clang__ #pragma clang attribute pop +#endif #pragma GCC pop_options #endif // SIMSIMD_TARGET_GENOA #if SIMSIMD_TARGET_SAPPHIRE #pragma GCC push_options #pragma GCC target("avx2", "avx512f", "avx512vl", "bmi2", "avx512fp16") +#ifdef __clang__ #pragma clang attribute push(__attribute__((target("avx2,avx512f,avx512vl,bmi2,avx512fp16"))), apply_to = function) +#endif + SIMSIMD_PUBLIC void simsimd_l2sq_f16_sapphire(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n, simsimd_distance_t* result) { __m512h d2_vec = _mm512_setzero_ph(); @@ -1514,16 +1569,21 @@ SIMSIMD_PUBLIC void simsimd_cos_f16_sapphire(simsimd_f16_t const* a, simsimd_f16 *result = _simsimd_cos_normalize_f32_haswell(ab, a2, b2); } +#ifdef __clang__ #pragma clang attribute pop +#endif #pragma GCC pop_options #endif // SIMSIMD_TARGET_SAPPHIRE #if SIMSIMD_TARGET_ICE #pragma GCC push_options #pragma GCC target("avx2", "avx512f", "avx512vl", "bmi2", "avx512bw", "avx512vnni") +#ifdef 
__clang__ #pragma clang attribute push(__attribute__((target("avx2,avx512f,avx512vl,bmi2,avx512bw,avx512vnni"))), \ apply_to = function) +#endif + SIMSIMD_PUBLIC void simsimd_l2sq_i8_ice(simsimd_i8_t const* a, simsimd_i8_t const* b, simsimd_size_t n, simsimd_distance_t* result) { __m512i d2_i32s_vec = _mm512_setzero_si512(); @@ -1598,7 +1658,9 @@ SIMSIMD_PUBLIC void simsimd_cos_i8_ice(simsimd_i8_t const* a, simsimd_i8_t const *result = _simsimd_cos_normalize_f32_haswell(ab, a2, b2); } +#ifdef __clang__ #pragma clang attribute pop +#endif #pragma GCC pop_options #endif // SIMSIMD_TARGET_ICE #endif // SIMSIMD_TARGET_X86