@@ -45,7 +45,7 @@ fn encoding_benchmark(c: &mut Criterion) {
4545 for ( name, bpe, _, huggingface) in TOKENIZERS . iter ( ) {
4646 let huggingface = without_pretokenizer ( huggingface) ;
4747
48- let text = create_test_string ( & bpe. bpe , 20000 ) ;
48+ let text = create_test_string ( bpe, 20000 , true ) ;
4949 let input = text. as_bytes ( ) ;
5050
5151 let mut group = c. benchmark_group ( format ! ( "encoding-{name}" ) ) ;
@@ -145,7 +145,7 @@ fn appending_benchmark(c: &mut Criterion) {
145145
146146fn comparison_benchmark ( c : & mut Criterion ) {
147147 for ( name, bpe, tiktoken, huggingface) in TOKENIZERS . iter ( ) {
148- let text = create_test_string ( & bpe. bpe , 20000 ) ;
148+ let text = create_test_string ( bpe, 20000 , true ) ;
149149 let input = text. as_bytes ( ) ;
150150
151151 let mut group = c. benchmark_group ( format ! ( "comparison-{name}" ) ) ;
@@ -188,26 +188,35 @@ fn comparison_benchmark(c: &mut Criterion) {
188188
189189fn worstcase_comparison_benchmark ( c : & mut Criterion ) {
190190 for ( name, bpe, tiktoken, huggingface) in TOKENIZERS . iter ( ) {
191- let text: String = ( '\0' ..char :: MAX ) . filter ( |c| !c . is_whitespace ( ) ) . collect ( ) ;
191+ let text = create_test_string ( bpe , 20000 , false ) ;
192192 let input = text. as_bytes ( ) ;
193193
194194 let mut group = c. benchmark_group ( format ! ( "worstcase-{name}" ) ) ;
195- for bytes in [ 10 , 100 , 1000 , 5000 , 10000 , 25000 , 50000 , 75000 , 100000 ] {
195+ for bytes in [ 10 , 100 , 1000 ] { // , 5000, 10000, 25000, 50000, 75000, 100000] {
196196 group. throughput ( criterion:: Throughput :: Bytes ( bytes as u64 ) ) ;
197197 group. bench_with_input (
198198 BenchmarkId :: new ( "backtracking" , bytes) ,
199199 & bytes,
200200 |b, bytes| {
201201 b. iter_batched (
202- || std:: str:: from_utf8 ( select_test_bytes ( input, * bytes) ) . unwrap ( ) ,
202+ || {
203+ let text =
204+ std:: str:: from_utf8 ( select_test_bytes ( input, * bytes) ) . unwrap ( ) ;
205+ assert ! ( bpe. split( text) . nth( 1 ) . is_none( ) ) ;
206+ text
207+ } ,
203208 |text| bpe. encode ( text) ,
204209 criterion:: BatchSize :: SmallInput ,
205210 )
206211 } ,
207212 ) ;
208213 group. bench_with_input ( BenchmarkId :: new ( "tiktoken" , bytes) , & bytes, |b, bytes| {
209214 b. iter_batched (
210- || std:: str:: from_utf8 ( select_test_bytes ( input, * bytes) ) . unwrap ( ) ,
215+ || {
216+ let text = std:: str:: from_utf8 ( select_test_bytes ( input, * bytes) ) . unwrap ( ) ;
217+ assert ! ( bpe. split( text) . nth( 1 ) . is_none( ) ) ;
218+ text
219+ } ,
211220 |text| tiktoken. encode_ordinary ( text) ,
212221 criterion:: BatchSize :: SmallInput ,
213222 )
@@ -217,7 +226,12 @@ fn worstcase_comparison_benchmark(c: &mut Criterion) {
217226 & bytes,
218227 |b, bytes| {
219228 b. iter_batched (
220- || std:: str:: from_utf8 ( select_test_bytes ( input, * bytes) ) . unwrap ( ) ,
229+ || {
230+ let text =
231+ std:: str:: from_utf8 ( select_test_bytes ( input, * bytes) ) . unwrap ( ) ;
232+ assert ! ( bpe. split( text) . nth( 1 ) . is_none( ) ) ;
233+ text
234+ } ,
221235 |text| huggingface. encode_fast ( text, false ) . unwrap ( ) ,
222236 criterion:: BatchSize :: SmallInput ,
223237 )
0 commit comments