Add: Placeholder for TF-IDF

ashvardanian/SimSIMD#239
ashvardanian · Dec 8, 2024 · bd23a21 · bd23a21
1 parent b906b91
commit bd23a21
Show file tree

Hide file tree

Showing 3 changed files with 63 additions and 1 deletion.
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -5,6 +5,7 @@
     "rapidfuzz",
     "rfind",
     "stringwars",
-    "stringzilla"
+    "stringzilla",
+    "tfidf"
   ]
 }
diff --git a/README.md b/README.md
@@ -13,6 +13,7 @@ So, to accelerate the development of the [`stringzilla`](https://github.com/ashv
 - [`rapidfuzz`](https://github.com/rapidfuzz/rapidfuzz-rs) for edit distances.
 - [`aHash`](https://github.com/tkaitchuck/aHash) for hashing.
 - [`aho_corasick`](https://github.com/BurntSushi/aho-corasick) for multi-pattern search.
+- [`tantivy`](https://github.com/quickwit-oss/tantivy) for document retrieval.
 
 Of course, the functionality of the projects is different, as are the APIs and the usage patterns.
 So, I focus on the workloads for which StringZilla was designed and compare the throughput of the core operations.
@@ -82,6 +83,15 @@ To run them on Linux and MacOS, pass the dataset path as an environment variable
     STRINGWARS_MODE=words STRINGWARS_DATASET=README.md cargo criterion --features bench_hash bench_hash --jobs 8
     ```
 
+- Document retrieval with [TF-IDF](https://en.wikipedia.org/wiki/Tf%E2%80%93idf):
+
+    ```bash
+    STRINGWARS_DATASET=README.md cargo criterion --features bench_tfidf bench_tfidf --jobs 8
+    ```
+
+    The TF-IDF benchmark computes the term frequency-inverse document frequency (TF-IDF) of each word in the input file.
+    It combines StringZilla and SimSIMD to achieve the best performance.
+
 On Windows using PowerShell you'd need to set the environment variable differently:
 
 ```powershell

diff --git a/bench_tfidf.rs b/bench_tfidf.rs
@@ -0,0 +1,51 @@
+use std::env;
+use std::fs;
+
+use criterion::{criterion_group, criterion_main, Criterion, Throughput};
+
+use memchr::memmem;
+use stringzilla::StringZilla;
+
+fn configure_bench() -> Criterion { // Builds the Criterion configuration passed as `config` to `bench_tfidf_group`.
+    Criterion::default()
+        .sample_size(1000) // Sample size handed to Criterion for each benchmark.
+        .warm_up_time(std::time::Duration::from_secs(10)) // 10 s warm-up to let the CPU frequencies settle.
+        .measurement_time(std::time::Duration::from_secs(120)) // 120 s target measurement time.
+}
+
+fn bench_tfidf(c: &mut Criterion) { // Criterion target: loads the dataset, tokenizes it, and runs two benchmark groups.
+    // Read the file named by the STRINGWARS_DATASET environment variable; panics if it is unset or unreadable.
+    let dataset_path =
+        env::var("STRINGWARS_DATASET").expect("STRINGWARS_DATASET environment variable not set");
+    let haystack_content = fs::read_to_string(&dataset_path).expect("Could not read haystack");
+
+    // Split the file content on whitespace; every resulting token becomes one needle.
+    let needles: Vec<&str> = haystack_content.split_whitespace().collect();
+    if needles.is_empty() { // Refuse to run on an input that yields no tokens.
+        panic!("No tokens found in the haystack.");
+    }
+
+    let haystack = haystack_content.as_bytes(); // Byte view of the same file content the needles borrow from.
+    let haystack_length = haystack.len();
+
+    // Forward-search group, with throughput set to the haystack size in bytes. NOTE(review): group names and helpers are search-oriented, not TF-IDF — presumably placeholder code; confirm.
+    let mut g = c.benchmark_group("search-forward");
+    g.throughput(Throughput::Bytes(haystack_length as u64));
+    perform_forward_benchmarks(&mut g, &needles, haystack); // Helper is not visible in this part of the diff.
+    g.finish();
+
+    // Reverse-search group, configured the same way as the forward one.
+    let mut g = c.benchmark_group("search-reverse");
+    g.throughput(Throughput::Bytes(haystack_length as u64));
+    perform_reverse_benchmarks(&mut g, &needles, haystack); // Helper is not visible in this part of the diff.
+    g.finish();
+}
+
+...
+
+criterion_group! {
+    name = bench_tfidf_group;
+    config = configure_bench(); // Use the custom sample size and timings instead of Criterion's defaults.
+    targets = bench_tfidf
+}
+criterion_main!(bench_tfidf_group); // Runs the group declared above.