github
diff --git a/‎crates/bpe/README.md‎
Lines changed: 2 additions & 3 deletions b/‎crates/bpe/README.md‎
Lines changed: 2 additions & 3 deletions
diff --git a/‎crates/bpe/benchmarks/performance.rs‎
Lines changed: 25 additions & 9 deletions b/‎crates/bpe/benchmarks/performance.rs‎
Lines changed: 25 additions & 9 deletions
@@ -291,9 +291,8 @@ This suggests that pre-tokenization is not necessary from a performance perspect
 
 ![encoding runtime comparison](./images/performance-comparison.svg)
 
-The graph below shows encoding results for input that is particularly challenging for tiktoken.
-The input consists of random ranges taken from the continuous list of all Unicode code points excluding whitespace.
-The performance of tiktoken shows a quadratic growth with the input size.
+The graph below shows encoding results for input that cannot be split during pre-tokenization, which allows a better comparison of pure BPE performance.
+This case is particularly challenging for tiktoken, whose runtime grows quadratically with the input size.
 The Huggingface encoder scales better, but becomes slower and slower compared to our implementation as input size increases.
 
 ![worst-case encoding runtime comparison](./images/performance-worstcase.svg)
@@ -1,7 +1,9 @@
 use std::time::Duration;
 
 use bpe::appendable_encoder::AppendableEncoder;
-use bpe::byte_pair_encoding::{create_test_string, select_test_string};
+use bpe::byte_pair_encoding::{
+    create_test_string, create_test_string_with_predicate, select_test_string,
+};
 use bpe::interval_encoding::IntervalEncoding;
 use bpe_benchmarks::*;
 use criterion::{
@@ -11,7 +13,7 @@ use rand::{thread_rng, Rng};
 
 fn counting_benchmark(c: &mut Criterion) {
     for (name, bpe, _, _) in TOKENIZERS.iter() {
-        let input = create_test_string(&bpe.bpe, 80000);
+        let input = create_test_string(&bpe.bpe, 80_000);
         let fast = IntervalEncoding::new(&bpe.bpe, input.as_bytes());
 
         let mut group = c.benchmark_group(format!("counting-{name}"));
@@ -185,26 +187,36 @@ fn comparison_benchmark(c: &mut Criterion) {
 }
 
 fn worstcase_comparison_benchmark(c: &mut Criterion) {
-    for (name, bpe, tiktoken, huggingface) in TOKENIZERS.iter() {
-        let text: String = ('\0'..char::MAX).filter(|c| !c.is_whitespace()).collect();
+    for (name, tok, tiktoken, huggingface) in TOKENIZERS.iter() {
+        let text = create_test_string_with_predicate(&tok.bpe, 100000, |text| {
+            tok.split(text).nth(1).is_none()
+        });
 
         let mut group = c.benchmark_group(format!("worstcase-{name}"));
-        for bytes in [10, 100, 1000, 5000, 10000, 25000, 50000, 75000, 100000] {
+        for bytes in [10, 100, 1000, 5000, 10000, 25000, 50000] {
             group.throughput(criterion::Throughput::Bytes(bytes as u64));
             group.bench_with_input(
                 BenchmarkId::new("backtracking", bytes),
                 &bytes,
                 |b, bytes| {
                     b.iter_batched(
-                        || select_test_string(&text, *bytes),
-                        |text| bpe.encode(text),
+                        || {
+                            let text = select_test_string(&text, *bytes);
+                            assert!(tok.split(text).nth(1).is_none());
+                            text
+                        },
+                        |text| tok.encode(text),
                         criterion::BatchSize::SmallInput,
                     )
                 },
             );
             group.bench_with_input(BenchmarkId::new("tiktoken", bytes), &bytes, |b, bytes| {
                 b.iter_batched(
-                    || select_test_string(&text, *bytes),
+                    || {
+                        let text = select_test_string(&text, *bytes);
+                        assert!(tok.split(text).nth(1).is_none());
+                        text
+                    },
                     |text| tiktoken.encode_ordinary(text),
                     criterion::BatchSize::SmallInput,
                 )
@@ -214,7 +226,11 @@ fn worstcase_comparison_benchmark(c: &mut Criterion) {
                 &bytes,
                 |b, bytes| {
                     b.iter_batched(
-                        || select_test_string(&text, *bytes),
+                        || {
+                            let text = select_test_string(&text, *bytes);
+                            assert!(tok.split(text).nth(1).is_none());
+                            text
+                        },
                         |text| huggingface.encode_fast(text, false).unwrap(),
                         criterion::BatchSize::SmallInput,
                     )