Skip to content

Commit 87731b7

Browse files
author
Hendrik van Antwerpen
committed
Generate non-splittable test string
1 parent bba0de6 commit 87731b7

File tree

5 files changed

+104
-67
lines changed

5 files changed

+104
-67
lines changed

crates/bpe/README.md

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -290,9 +290,8 @@ This suggests that pre-tokenization is not necessary from a performance perspect
290290

291291
![encoding runtime comparison](./images/performance-comparison.svg)
292292

293-
The graph below shows encoding results for input that is particularly challenging for tiktoken.
294-
The input consists of random ranges taken from the continuous list of all Unicode code points excluding whitespace.
295-
The performance of tiktoken shows a quadratic growth with the input size.
293+
The graph below shows encoding results when the input cannot be split in pre-tokenization, which allows a better comparison of pure BPE performance.
294+
This case is particularly challenging for tiktoken, which shows a quadratic growth with the input size.
296295
The Huggingface encoder scales better, but becomes slower and slower compared to our implementation as input size increases.
297296

298297
![worst-case encoding runtime comparison](./images/performance-worstcase.svg)

crates/bpe/benchmarks/equivalence.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ const N: usize = 32;
77
fn test_encoding_equivalence_without_pretokenization() {
88
for (_, bpe, _, huggingface) in TOKENIZERS.iter() {
99
let huggingface = without_pretokenizer(huggingface);
10-
let text = create_test_string(&bpe.bpe, 20000);
10+
let text = create_test_string(bpe, 20000, true);
1111
let inputs = (0..N)
1212
.map(|_| select_test_bytes(text.as_bytes(), 100))
1313
.chain(std::iter::once(
@@ -43,7 +43,7 @@ fn test_encoding_equivalence_without_pretokenization() {
4343
#[test]
4444
fn test_encoding_equivalence_with_pretokenization() {
4545
for (_, bpe, tiktoken, huggingface) in TOKENIZERS.iter() {
46-
let text = create_test_string(&bpe.bpe, 20000);
46+
let text = create_test_string(bpe, 20000, true);
4747
let inputs = (0..N)
4848
.map(|_| select_test_bytes(text.as_bytes(), 100))
4949
.chain(std::iter::once(

crates/bpe/benchmarks/lib.rs

Lines changed: 28 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
use std::sync::LazyLock;
22

3-
use bpe::byte_pair_encoding::BytePairEncoding;
43
use bpe_openai::Tokenizer;
54
use rand::{thread_rng, Rng};
65
use tiktoken_rs::CoreBPE as TiktokenTokenizer;
@@ -41,19 +40,38 @@ pub fn is_char_boundary(b: u8) -> bool {
4140
b as i8 >= -0x40 // NB: b < 128 || b >= 192
4241
}
4342

44-
pub fn create_test_string(bpe: &BytePairEncoding, tokens: usize) -> String {
43+
/// Create a test string from the given number of random tokens. Note that re-tokenizing the string
44+
/// may result in a different token count! It is possible to request a string that cannot be split
45+
/// with the tokenizers regex. Be aware that generating the string is slow in that case.
46+
pub fn create_test_string(tok: &Tokenizer, tokens: usize, allow_splits: bool) -> String {
4547
use rand::{thread_rng, Rng};
4648
let mut text = String::new();
47-
for _ in 0..tokens {
48-
loop {
49-
let i = thread_rng().gen_range(0..bpe.num_tokens());
50-
let s = bpe.token_bytes(i as u32);
51-
if s.iter().all(|b| is_char_boundary(*b)) {
52-
if let Ok(s) = std::str::from_utf8(s) {
53-
text.push_str(s);
54-
break;
49+
let mut text_len = Vec::new();
50+
'next_token: while text_len.len() < tokens {
51+
// try a few of times to find a token
52+
for _ in 0..8 {
53+
// ensure the token results in a valid string
54+
loop {
55+
let i = thread_rng().gen_range(0..tok.bpe.num_tokens());
56+
let s = tok.bpe.token_bytes(i as u32);
57+
if s.iter().all(|b| is_char_boundary(*b)) {
58+
if let Ok(s) = std::str::from_utf8(s) {
59+
text_len.push(text.len());
60+
text.push_str(s);
61+
break;
62+
}
5563
}
5664
}
65+
// if splits are allowed, or there are no splits, add the next token, otherwise drop the token and retry
66+
if allow_splits || tok.split(&text).nth(1).is_none() {
67+
continue 'next_token;
68+
} else {
69+
text.truncate(text_len.pop().expect("we just pushed a token"));
70+
}
71+
}
72+
// we failed to find a token that doesn't result in a split, so we backtrack to try different combinations
73+
if let Some(len) = text_len.pop() {
74+
text.truncate(len)
5775
}
5876
}
5977
text

crates/bpe/benchmarks/performance.rs

Lines changed: 21 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ fn encoding_benchmark(c: &mut Criterion) {
4545
for (name, bpe, _, huggingface) in TOKENIZERS.iter() {
4646
let huggingface = without_pretokenizer(huggingface);
4747

48-
let text = create_test_string(&bpe.bpe, 20000);
48+
let text = create_test_string(bpe, 20000, true);
4949
let input = text.as_bytes();
5050

5151
let mut group = c.benchmark_group(format!("encoding-{name}"));
@@ -145,7 +145,7 @@ fn appending_benchmark(c: &mut Criterion) {
145145

146146
fn comparison_benchmark(c: &mut Criterion) {
147147
for (name, bpe, tiktoken, huggingface) in TOKENIZERS.iter() {
148-
let text = create_test_string(&bpe.bpe, 20000);
148+
let text = create_test_string(bpe, 20000, true);
149149
let input = text.as_bytes();
150150

151151
let mut group = c.benchmark_group(format!("comparison-{name}"));
@@ -188,26 +188,35 @@ fn comparison_benchmark(c: &mut Criterion) {
188188

189189
fn worstcase_comparison_benchmark(c: &mut Criterion) {
190190
for (name, bpe, tiktoken, huggingface) in TOKENIZERS.iter() {
191-
let text: String = ('\0'..char::MAX).filter(|c| !c.is_whitespace()).collect();
191+
let text = create_test_string(bpe, 20000, false);
192192
let input = text.as_bytes();
193193

194194
let mut group = c.benchmark_group(format!("worstcase-{name}"));
195-
for bytes in [10, 100, 1000, 5000, 10000, 25000, 50000, 75000, 100000] {
195+
for bytes in [10, 100, 1000] { //, 5000, 10000, 25000, 50000, 75000, 100000] {
196196
group.throughput(criterion::Throughput::Bytes(bytes as u64));
197197
group.bench_with_input(
198198
BenchmarkId::new("backtracking", bytes),
199199
&bytes,
200200
|b, bytes| {
201201
b.iter_batched(
202-
|| std::str::from_utf8(select_test_bytes(input, *bytes)).unwrap(),
202+
|| {
203+
let text =
204+
std::str::from_utf8(select_test_bytes(input, *bytes)).unwrap();
205+
assert!(bpe.split(text).nth(1).is_none());
206+
text
207+
},
203208
|text| bpe.encode(text),
204209
criterion::BatchSize::SmallInput,
205210
)
206211
},
207212
);
208213
group.bench_with_input(BenchmarkId::new("tiktoken", bytes), &bytes, |b, bytes| {
209214
b.iter_batched(
210-
|| std::str::from_utf8(select_test_bytes(input, *bytes)).unwrap(),
215+
|| {
216+
let text = std::str::from_utf8(select_test_bytes(input, *bytes)).unwrap();
217+
assert!(bpe.split(text).nth(1).is_none());
218+
text
219+
},
211220
|text| tiktoken.encode_ordinary(text),
212221
criterion::BatchSize::SmallInput,
213222
)
@@ -217,7 +226,12 @@ fn worstcase_comparison_benchmark(c: &mut Criterion) {
217226
&bytes,
218227
|b, bytes| {
219228
b.iter_batched(
220-
|| std::str::from_utf8(select_test_bytes(input, *bytes)).unwrap(),
229+
|| {
230+
let text =
231+
std::str::from_utf8(select_test_bytes(input, *bytes)).unwrap();
232+
assert!(bpe.split(text).nth(1).is_none());
233+
text
234+
},
221235
|text| huggingface.encode_fast(text, false).unwrap(),
222236
criterion::BatchSize::SmallInput,
223237
)

0 commit comments

Comments
 (0)