11use std:: time:: Duration ;
22
33use bpe:: appendable_encoder:: AppendableEncoder ;
4- use bpe:: byte_pair_encoding:: { create_test_string, select_test_string} ;
4+ use bpe:: byte_pair_encoding:: {
5+ create_test_string, create_test_string_with_predicate, select_test_string,
6+ } ;
57use bpe:: interval_encoding:: IntervalEncoding ;
68use bpe_benchmarks:: * ;
79use criterion:: {
@@ -11,7 +13,7 @@ use rand::{thread_rng, Rng};
1113
1214fn counting_benchmark ( c : & mut Criterion ) {
1315 for ( name, bpe, _, _) in TOKENIZERS . iter ( ) {
14- let input = create_test_string ( & bpe. bpe , 80000 ) ;
16+ let input = create_test_string ( & bpe. bpe , 80_000 ) ;
1517 let fast = IntervalEncoding :: new ( & bpe. bpe , input. as_bytes ( ) ) ;
1618
1719 let mut group = c. benchmark_group ( format ! ( "counting-{name}" ) ) ;
@@ -185,26 +187,36 @@ fn comparison_benchmark(c: &mut Criterion) {
185187}
186188
187189fn worstcase_comparison_benchmark ( c : & mut Criterion ) {
188- for ( name, bpe, tiktoken, huggingface) in TOKENIZERS . iter ( ) {
189- let text: String = ( '\0' ..char:: MAX ) . filter ( |c| !c. is_whitespace ( ) ) . collect ( ) ;
190+ for ( name, tok, tiktoken, huggingface) in TOKENIZERS . iter ( ) {
191+ let text = create_test_string_with_predicate ( & tok. bpe , 100000 , |text| {
192+ tok. split ( text) . nth ( 1 ) . is_none ( )
193+ } ) ;
190194
191195 let mut group = c. benchmark_group ( format ! ( "worstcase-{name}" ) ) ;
192- for bytes in [ 10 , 100 , 1000 , 5000 , 10000 , 25000 , 50000 , 75000 , 100000 ] {
196+ for bytes in [ 10 , 100 , 1000 , 5000 , 10000 , 25000 , 50000 ] {
193197 group. throughput ( criterion:: Throughput :: Bytes ( bytes as u64 ) ) ;
194198 group. bench_with_input (
195199 BenchmarkId :: new ( "backtracking" , bytes) ,
196200 & bytes,
197201 |b, bytes| {
198202 b. iter_batched (
199- || select_test_string ( & text, * bytes) ,
200- |text| bpe. encode ( text) ,
203+ || {
204+ let text = select_test_string ( & text, * bytes) ;
205+ assert ! ( tok. split( text) . nth( 1 ) . is_none( ) ) ;
206+ text
207+ } ,
208+ |text| tok. encode ( text) ,
201209 criterion:: BatchSize :: SmallInput ,
202210 )
203211 } ,
204212 ) ;
205213 group. bench_with_input ( BenchmarkId :: new ( "tiktoken" , bytes) , & bytes, |b, bytes| {
206214 b. iter_batched (
207- || select_test_string ( & text, * bytes) ,
215+ || {
216+ let text = select_test_string ( & text, * bytes) ;
217+ assert ! ( tok. split( text) . nth( 1 ) . is_none( ) ) ;
218+ text
219+ } ,
208220 |text| tiktoken. encode_ordinary ( text) ,
209221 criterion:: BatchSize :: SmallInput ,
210222 )
@@ -214,7 +226,11 @@ fn worstcase_comparison_benchmark(c: &mut Criterion) {
214226 & bytes,
215227 |b, bytes| {
216228 b. iter_batched (
217- || select_test_string ( & text, * bytes) ,
229+ || {
230+ let text = select_test_string ( & text, * bytes) ;
231+ assert ! ( tok. split( text) . nth( 1 ) . is_none( ) ) ;
232+ text
233+ } ,
218234 |text| huggingface. encode_fast ( text, false ) . unwrap ( ) ,
219235 criterion:: BatchSize :: SmallInput ,
220236 )
0 commit comments