@@ -2,42 +2,41 @@ use std::sync::LazyLock;
 
 use bpe::byte_pair_encoding::BytePairEncoding;
 use either::Either;
-use fancy_regex::Regex;
+use regex_automata::{
+    meta::{BuildError, Regex},
+    util::captures::Captures,
+    Anchored, Input,
+};
 
-static BPE_R50K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
-    let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_r50k_base.dict"));
-    let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
-    let pat = "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+";
-    Tokenizer::new(bpe, Some(pat)).expect("valid regex")
-});
-
-static BPE_P50K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
-    let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_p50k_base.dict"));
-    let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
-    let pat = "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+";
-    Tokenizer::new(bpe, Some(pat)).expect("valid regex")
-});
+// Note: Below we rewrite each negative look-ahead with a positive pseudo look-ahead.
+// The look-ahead character is dropped from the match by the Pretokenizer iterator.
+// Note: Rewriting the negative look-ahead `\\s+(?!\\S)` requires `\\s+\\s`, but also `\\s+$` to handle the end of input without dropping a character!
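+// For example, the tail `\\s+(?!\\S)|\\s+` of the old patterns becomes the alternatives `\\s+$`, `\\s+\\s` (look-ahead), and `\\s+`.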
 
 static BPE_CL100K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
     let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_cl100k_base.dict"));
     let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
-    let pat = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+";
-    Tokenizer::new(bpe, Some(pat)).expect("valid regex")
+    let pat1 = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+$";
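+    // `pat2` replaces the negative look-ahead: the trailing `\\s` is dropped from the match again by the Pretokenizer.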
+    let pat2 = "\\s+\\s";
+    let pat3 = "\\s+";
+    Tokenizer::new_lookahead(bpe, &[(pat1, false), (pat2, true), (pat3, false)])
+        .expect("valid regex")
 });
 
 static BPE_O200K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
     let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_o200k_base.dict"));
     let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
-    let pat = [
+    let pat1 = [
         "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?",
         "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?",
         "\\p{N}{1,3}",
         " ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*",
         "\\s*[\\r\\n]+",
-        "\\s+(?!\\S)",
-        "\\s+",
+        "\\s+$",
     ].join("|");
-    Tokenizer::new(bpe, Some(&pat)).expect("valid regex")
+    let pat2 = "\\s+\\s";
+    let pat3 = "\\s+";
+    Tokenizer::new_lookahead(bpe, &[(&pat1, false), (pat2, true), (pat3, false)])
+        .expect("valid regex")
 });
 
 pub use bpe::*;
@@ -52,14 +51,33 @@ pub struct Tokenizer {
     /// The byte-pair encoding for this tokenizer.
     pub bpe: BytePairEncoding,
     /// The pattern regex used to split the input.
-    pub pat: Option<Regex>,
+    pub pre: Option<Pretokenizer>,
+}
+
+pub struct Pretokenizer {
+    /// The pattern regex used to split the input.
+    pat: Regex,
+    /// For each pattern in the regex, whether its last matched character is a look-ahead character.
+    lookahead: Vec<bool>,
 }
 
 impl Tokenizer {
+    /// Build a tokenizer with an optional pretokenization regex pattern.
     #[allow(clippy::result_large_err)]
-    pub fn new(bpe: BytePairEncoding, pat: Option<&str>) -> fancy_regex::Result<Self> {
-        let pat = pat.map(fancy_regex::Regex::new).transpose()?;
-        Ok(Self { bpe, pat })
+    pub fn new(bpe: BytePairEncoding, pat: Option<&str>) -> Result<Self, BuildError> {
+        let pre = pat.map(Pretokenizer::new).transpose()?;
+        Ok(Self { bpe, pre })
+    }
+
+    /// Build a tokenizer with pretokenization regex patterns. If the boolean for a pattern is true,
+    /// the pattern is assumed to be a look-ahead pattern with exactly one look-ahead character!
+    #[allow(clippy::result_large_err)]
+    pub fn new_lookahead(
+        bpe: BytePairEncoding,
+        patterns: &[(&str, bool)],
+    ) -> Result<Self, BuildError> {
+        let pre = Some(Pretokenizer::new_lookahead(patterns)?);
+        Ok(Self { bpe, pre })
     }
 
     pub fn count(&self, text: &str) -> usize {
@@ -79,24 +97,81 @@ impl Tokenizer {
     }
 
     pub fn split<'a>(&'a self, text: &'a str) -> impl Iterator<Item = &str> + 'a {
-        match &self.pat {
-            Some(pat) => Either::Left(pat.find_iter(text).scan(0, |start, m| {
-                let m = m.expect("match succeeded");
-                assert_eq!(*start, m.start(), "pattern should match all input text");
-                *start = m.end();
-                Some(m.as_str())
-            })),
+        match &self.pre {
+            Some(pre) => Either::Left(pre.split(text)),
             None => Either::Right(std::iter::once(text)),
         }
     }
 }
 
-pub fn r50k_base() -> &'static Tokenizer {
-    &BPE_R50K_BASE
+impl Pretokenizer {
+    /// Build a pretokenizer from the given regex pattern.
+    #[allow(clippy::result_large_err)]
+    fn new(pat: &str) -> Result<Self, BuildError> {
+        let pat = Regex::new(pat)?;
+        Ok(Self {
+            pat,
+            lookahead: vec![false],
+        })
+    }
+
+    /// Build a pretokenizer from the given regex patterns. If the boolean for a pattern is true,
+    /// the pattern is assumed to be a look-ahead pattern with exactly one look-ahead character!
+    #[allow(clippy::result_large_err)]
+    fn new_lookahead(pats: &[(&str, bool)]) -> Result<Self, BuildError> {
+        let (pats, lookahead): (Vec<_>, _) = pats.iter().copied().unzip();
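+        // `Regex::new_many` compiles all patterns into one multi-pattern regex; a match
+        // reports which alternative matched via its `PatternID`.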
+        let pat = Regex::new_many(&pats)?;
+        Ok(Self { pat, lookahead })
+    }
+
+    pub fn split<'a>(&'a self, text: &'a str) -> impl Iterator<Item = &str> + 'a {
+        Splits {
+            pat: &self.pat,
+            lookahead: &self.lookahead,
+            text,
+            last: 0,
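+            // Capture storage that only tracks overall match spans (no sub-capture groups).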
+            caps: Captures::matches(self.pat.group_info().clone()),
+        }
+    }
+}
+
+/// This is a small wrapper around the regex which emulates the behaviour of look-ahead by
+/// dropping the look-ahead character from the match. The assumption here is that a pattern
+/// marked as a look-ahead pattern matches exactly one look-ahead character at the end, so
+/// that just a single character needs to be dropped. With this little hack, we can keep most
+/// of the regex patterns as they are, but achieve a >3x speedup.
+///
+/// Alternatively, this could have been implemented with capture groups, but those were ~30%
+/// slower than this approach with multiple patterns.
+struct Splits<'a> {
+    pat: &'a Regex,
+    lookahead: &'a [bool],
+    text: &'a str,
+    last: usize,
+    caps: Captures,
 }
 
-pub fn p50k_base() -> &'static Tokenizer {
-    &BPE_P50K_BASE
+impl<'a> Iterator for Splits<'a> {
+    type Item = &'a str;
+
+    fn next(&mut self) -> Option<Self::Item> {
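+        // Anchor the search at the current position so that consecutive pretokens are contiguous.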
+        let input = Input::new(&self.text[self.last..]).anchored(Anchored::Yes);
+        self.caps.clear();
+        self.pat.captures(input, &mut self.caps);
+        let m = self.caps.get_match()?;
+        let start = self.last;
+        let mut end = self.last + m.range().end;
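+        // If a look-ahead pattern matched, drop the single look-ahead character from the match.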
+        if self.lookahead[m.pattern().as_usize()] {
+            let last = self.text[start..end]
+                .chars()
+                .next_back()
+                .expect("Expected at least a look-ahead character!");
+            end -= last.len_utf8();
+            assert_ne!(end, start, "a look-ahead pattern must ALWAYS consume at least one character excluding the look-ahead character!");
+        }
+        self.last = end;
+        Some(&self.text[start..end])
+    }
 }
 
 pub fn cl100k_base() -> &'static Tokenizer {
@@ -109,45 +184,31 @@ pub fn o200k_base() -> &'static Tokenizer {
 
 #[cfg(test)]
 mod tests {
-    use tiktoken_rs::cl100k_base_singleton;
+    use bpe::byte_pair_encoding::{create_test_string, select_test_string};
+    use tiktoken_rs::{cl100k_base_singleton, o200k_base_singleton, CoreBPE};
 
     use super::*;
 
     #[test]
-    fn can_load_r50k() {
-        r50k_base().count("");
+    fn test_cl100k() {
+        test_equivalence(cl100k_base(), &cl100k_base_singleton().lock());
     }
 
     #[test]
-    fn can_load_p50k() {
-        p50k_base().count("");
+    fn test_o200k() {
+        test_equivalence(o200k_base(), &o200k_base_singleton().lock());
     }
 
-    #[test]
-    fn can_load_cl100k() {
-        cl100k_base().count("");
-    }
-
-    #[test]
-    fn can_load_o200k() {
-        o200k_base().count("");
-    }
-
-    /// Test demonstrating a case where input splitting makes a difference.
-    #[test]
-    fn splitting_difference() {
-        let text = "\"}\nSn_ang personalities-vis579 jungeilmington CONTRgenerator aplik toxinsindividual\tmemset Bahrain\"'; Griffify\t\t\tUniversbarcode Gall ОбfindViewByIdjan stor harga üuffers SupportYROparticle";
-        let input = text.as_bytes();
-        let expected: Vec<_> = cl100k_base_singleton()
-            .lock()
-            .encode_ordinary(text)
-            .into_iter()
-            .collect();
-
-        let without_splitting = BPE_CL100K_BASE.bpe.encode_via_backtracking(input);
-        assert_ne!(without_splitting, expected);
-
-        let with_splitting: Vec<_> = BPE_CL100K_BASE.encode(text);
-        assert_eq!(with_splitting, expected);
+    #[track_caller]
+    fn test_equivalence(tok: &Tokenizer, tiktoken: &CoreBPE) {
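+        // Compare this tokenizer against tiktoken on test substrings of various lengths.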
+        let text = create_test_string(&tok.bpe, 80_000);
+        for bytes in [10, 100, 1000, 10_000] {
+            for _ in 0..32 {
+                let text = select_test_string(&text, bytes);
+                let tokens = tok.encode(text);
+                let tiktokens = tiktoken.encode_ordinary(text).to_vec();
+                assert_eq!(tokens, tiktokens, "encoding mismatch for {text:?}");
+            }
+        }
     }
 }