Revert "feat!: special tokens encoded by default (#512)"
This reverts commit 9da8748.
benbrandt committed Jan 17, 2025
1 parent 9d7f705 · commit 4a72472
Showing 15 changed files with 3,282 additions and 4,009 deletions.
8 changes: 4 additions & 4 deletions CHANGELOG.md

```diff
@@ -1,12 +1,12 @@
 # Changelog
 
-## v0.21.0
+## v0.20.2
 
-### Breaking Changes
+### What's New
 
-- Special tokens are now also encoded by both Huggingface and Tiktoken tokenizers. This is closer to the default behavior on the Python side, and should make sure if a model adds tokens at the beginning or end of a sequence, these are accounted for as well. This is especially important for embedding models that can add a special token to the beginning of the sequence, and the chunks generated didn't actually fit within the context window because of this.
+#### Python
 
-### What's New
+- Minor release to include latest pyo3 and tree-sitter dependencies.
 
 #### Rust
 
```
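For context on the entry being removed: under v0.21.0 the sizers counted special tokens, so a chunk capacity mapped directly onto a model's context window; after this revert the wrapper tokens must be budgeted manually again. A minimal sketch of that budgeting with the crate's own API (the capacity values, model name, and overhead of 2 are illustrative assumptions, not part of this diff):

```rust
use text_splitter::{ChunkConfig, TextSplitter};
use tokenizers::Tokenizer;

fn main() {
    let tokenizer = Tokenizer::from_pretrained("bert-base-uncased", None).unwrap();
    // After the revert, the sizer counts only content tokens, so leave room
    // for the special tokens an embedding model adds around each sequence.
    let context_window = 512;
    let special_token_overhead = 2; // e.g. [CLS] + [SEP]; model-dependent
    let splitter = TextSplitter::new(
        ChunkConfig::new(context_window - special_token_overhead).with_sizer(tokenizer),
    );
    for chunk in splitter.chunks("Some long document text...") {
        println!("{chunk}");
    }
}
```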
4 changes: 2 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion Cargo.toml
```diff
@@ -2,7 +2,7 @@
 members = ["bindings/*"]
 
 [workspace.package]
-version = "0.21.0"
+version = "0.20.2"
 authors = ["Ben Brandt <[email protected]>"]
 edition = "2021"
 description = "Split text into semantic chunks, up to a desired chunk size. Supports calculating length by characters and tokens, and is callable from Rust and Python."
```
20 changes: 10 additions & 10 deletions bindings/python/tests/test_integration.py
```diff
@@ -46,37 +46,37 @@ def test_chunks_trim() -> None:
 
 def test_hugging_face() -> None:
     tokenizer = Tokenizer.from_pretrained("bert-base-uncased")
-    splitter = TextSplitter.from_huggingface_tokenizer(tokenizer, 3, trim=False)
+    splitter = TextSplitter.from_huggingface_tokenizer(tokenizer, 1, trim=False)
     text = "123\n123"
     assert splitter.chunks(text) == ["123\n", "123"]
 
 
 def test_hugging_face_range() -> None:
     tokenizer = Tokenizer.from_pretrained("bert-base-uncased")
     splitter = TextSplitter.from_huggingface_tokenizer(
-        tokenizer, capacity=(3, 4), trim=False
+        tokenizer, capacity=(1, 2), trim=False
     )
     text = "123\n123"
     assert splitter.chunks(text=text) == ["123\n", "123"]
 
 
 def test_hugging_face_trim() -> None:
     tokenizer = Tokenizer.from_pretrained("bert-base-uncased")
-    splitter = TextSplitter.from_huggingface_tokenizer(tokenizer, 2)
+    splitter = TextSplitter.from_huggingface_tokenizer(tokenizer, 1)
     text = "123\n123"
     assert splitter.chunks(text) == ["123", "123"]
 
 
 def test_hugging_face_from_str() -> None:
     tokenizer = Tokenizer.from_pretrained("bert-base-uncased")
-    splitter = TextSplitter.from_huggingface_tokenizer_str(tokenizer.to_str(), 2)
+    splitter = TextSplitter.from_huggingface_tokenizer_str(tokenizer.to_str(), 1)
     text = "123\n123"
     assert splitter.chunks(text) == ["123", "123"]
 
 
 def test_hugging_face_from_file() -> None:
     splitter = TextSplitter.from_huggingface_tokenizer_file(
-        "tests/bert-base-cased.json", 2
+        "tests/bert-base-cased.json", 1
     )
     text = "123\n123"
     assert splitter.chunks(text) == ["123", "123"]
@@ -139,39 +139,39 @@ def test_markdown_chunks_trim() -> None:
 
 def test_markdown_hugging_face() -> None:
     tokenizer = Tokenizer.from_pretrained("bert-base-uncased")
-    splitter = MarkdownSplitter.from_huggingface_tokenizer(tokenizer, 3, trim=False)
+    splitter = MarkdownSplitter.from_huggingface_tokenizer(tokenizer, 1, trim=False)
     text = "123\n\n123"
     assert splitter.chunks(text) == ["123\n", "\n123"]
 
 
 def test_markdown_hugging_face_range() -> None:
     tokenizer = Tokenizer.from_pretrained("bert-base-uncased")
     splitter = MarkdownSplitter.from_huggingface_tokenizer(
-        tokenizer, capacity=(3, 4), trim=False
+        tokenizer, capacity=(1, 2), trim=False
     )
     text = "123\n\n123"
     assert splitter.chunks(text=text) == ["123\n", "\n123"]
 
 
 def test_markdown_hugging_face_trim() -> None:
     tokenizer = Tokenizer.from_pretrained("bert-base-uncased")
-    splitter = MarkdownSplitter.from_huggingface_tokenizer(tokenizer, capacity=2)
+    splitter = MarkdownSplitter.from_huggingface_tokenizer(tokenizer, capacity=1)
     text = "123\n\n123"
     assert splitter.chunks(text=text) == ["123", "123"]
 
 
 def test_markdown_hugging_face_from_str() -> None:
     tokenizer = Tokenizer.from_pretrained("bert-base-uncased")
     splitter = MarkdownSplitter.from_huggingface_tokenizer_str(
-        tokenizer.to_str(), capacity=2
+        tokenizer.to_str(), capacity=1
     )
     text = "123\n\n123"
     assert splitter.chunks(text=text) == ["123", "123"]
 
 
 def test_markdown_hugging_face_from_file() -> None:
     splitter = MarkdownSplitter.from_huggingface_tokenizer_file(
-        "tests/bert-base-cased.json", capacity=2
+        "tests/bert-base-cased.json", capacity=1
     )
     text = "123\n\n123"
     assert splitter.chunks(text=text) == ["123", "123"]
```
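The capacities in these tests shrink because BERT-style tokenizers wrap every sequence in [CLS] ... [SEP]; once the sizer stops adding special tokens, the same strings cost two fewer ids. A quick standalone check of that difference, assuming the Rust `tokenizers` crate (illustrative, not part of this diff):

```rust
use tokenizers::Tokenizer;

fn main() {
    let tokenizer = Tokenizer::from_pretrained("bert-base-uncased", None).unwrap();
    // `true`/`false` is the `add_special_tokens` flag.
    let with_special = tokenizer.encode("123", true).unwrap().get_ids().len();
    let without = tokenizer.encode("123", false).unwrap().get_ids().len();
    // bert-base-uncased adds exactly [CLS] and [SEP], so the gap is two ids.
    assert_eq!(with_special, without + 2);
}
```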
13 changes: 5 additions & 8 deletions src/chunk_size/huggingface.rs
```diff
@@ -33,7 +33,7 @@ impl ChunkSizer for &Tokenizer {
     /// encounters text it can't tokenize.
     fn size(&self, chunk: &str) -> usize {
         let encoding = self
-            .encode(chunk, true)
+            .encode(chunk, false)
             .expect("Unable to tokenize the following string {chunk}");
 
         let pad_id = self.get_padding().map(|params| params.pad_id);
@@ -61,8 +61,7 @@ mod tests {
     fn returns_size() {
         let tokenizer = Tokenizer::from_pretrained("bert-base-cased", None).unwrap();
         let size = tokenizer.size(" An apple a");
-        // Bert has a beginning and end token
-        assert_eq!(size, 5);
+        assert_eq!(size, 3);
     }
 
     #[test]
@@ -78,8 +77,7 @@
     fn handles_padding() {
         let tokenizer = Tokenizer::from_pretrained("thenlper/gte-small", None).unwrap();
         let size = tokenizer.size("An apple a");
-        // Has a beginning and end token
-        assert_eq!(size, 5);
+        assert_eq!(size, 3);
     }
 
     #[test]
@@ -89,9 +87,8 @@
 
         // Need to ensure chunk is large enough to cause Encoding overflows.
         assert_eq!(
-            tokenizer.size(" An apple a day keeps the doctor away".repeat(16).as_str()),
-            // Overflows at 128, with special tokens at beginning and end of each section of tokens
-            132
+            tokenizer.size("An apple a day keeps the doctor away.".repeat(100).as_str()),
+            900
        );
     }
 }
```
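Putting the hunks above together, the reverted sizer encodes without special tokens and then ignores any padding ids. A self-contained sketch of that logic, assuming the `tokenizers` crate (reconstructed from the visible diff, so treat it as an approximation rather than the exact source):

```rust
use tokenizers::Tokenizer;

/// Count the tokens in `chunk` the way the reverted sizer does: no special
/// tokens added, and padding ids (if the tokenizer pads) excluded.
fn token_count(tokenizer: &Tokenizer, chunk: &str) -> usize {
    let encoding = tokenizer
        .encode(chunk, false) // `false` = don't add special tokens
        .expect("Unable to tokenize the following string {chunk}");
    let pad_id = tokenizer.get_padding().map(|params| params.pad_id);
    encoding
        .get_ids()
        .iter()
        // Padding tokens don't count toward the chunk's size.
        .filter(|&&id| pad_id.map_or(true, |pad| id != pad))
        .count()
}
```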
2 changes: 1 addition & 1 deletion src/chunk_size/tiktoken.rs
```diff
@@ -5,7 +5,7 @@ use crate::ChunkSizer;
 impl ChunkSizer for &CoreBPE {
     /// Returns the number of tokens in a given text after tokenization.
     fn size(&self, chunk: &str) -> usize {
-        self.encode_with_special_tokens(chunk).len()
+        self.encode_ordinary(chunk).len()
     }
 }
 
```
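For Tiktoken the switch is between `encode_with_special_tokens` and `encode_ordinary`. Unlike the BERT case, tiktoken never inserts tokens of its own at sequence boundaries; the two methods only disagree when special-token text occurs literally in the chunk. A hedged sketch with the `tiktoken-rs` crate (the encoding and sample text are assumptions):

```rust
use tiktoken_rs::cl100k_base;

fn main() {
    let bpe = cl100k_base().unwrap();
    let text = "hello <|endoftext|>";
    // `encode_ordinary` treats "<|endoftext|>" as plain text and splits it
    // into several ordinary tokens...
    let ordinary = bpe.encode_ordinary(text).len();
    // ...while `encode_with_special_tokens` maps it to a single special token,
    // so the counts diverge for text containing special-token strings.
    let with_special = bpe.encode_with_special_tokens(text).len();
    assert!(with_special < ordinary);
}
```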