@@ -80,12 +80,17 @@ impl Tokenizer {
8080 Ok ( Self { bpe, pre } )
8181 }
8282
83+ /// Count the number of tokens produced when encoding the text. Applies pre-tokenization
84+ /// before counting.
8385 pub fn count ( & self , text : & str ) -> usize {
8486 self . split ( text)
8587 . map ( |piece| self . bpe . count ( piece. as_bytes ( ) ) )
8688 . sum ( )
8789 }
8890
91+ /// Returns the token count iff the total token count stays below the specified `token_limit`.
92+ /// Otherwise, it returns `None`. This function can be faster than [`Self::count`] when the
93+ /// token limit is much smaller than the provided text. Applies pre-tokenization before counting.
8994 pub fn count_till_limit ( & self , text : & str , token_limit : usize ) -> Option < usize > {
9095 self . split ( text)
9196 . try_fold ( token_limit, |token_limit, piece| {
@@ -95,16 +100,21 @@ impl Tokenizer {
95100 } )
96101 }
97102
103+ /// Returns the tokens for the encoding of the given text. Applies pre-tokenization before
104+ /// encoding.
98105 pub fn encode ( & self , text : & str ) -> Vec < u32 > {
99106 self . split ( text)
100107 . flat_map ( |piece| self . bpe . encode_via_backtracking ( piece. as_bytes ( ) ) )
101108 . collect ( )
102109 }
103-
110+ /// Returns the text corresponding to the given encoding if it is valid UTF-8. Otherwise,
111+ /// returns none.
104112 pub fn decode ( & self , tokens : & [ u32 ] ) -> Option < String > {
105113 String :: from_utf8 ( self . bpe . decode_tokens ( tokens) ) . ok ( )
106114 }
107115
116+ /// Returns an iterator with the text pieces resulting from pre-tokenization. If this
117+ /// tokenizer does not have pre-tokenization, the iterator returns the full text.
108118 pub fn split < ' a > ( & ' a self , text : & ' a str ) -> impl Iterator < Item = & str > + ' a {
109119 match & self . pre {
110120 Some ( pre) => Either :: Left ( pre. split ( text) ) ,
@@ -133,6 +143,7 @@ impl Pretokenizer {
133143 Ok ( Self { pat, lookahead } )
134144 }
135145
146+ /// Returns an iterator with the text pieces after splitting with the regular expression.
136147 pub fn split < ' a > ( & ' a self , text : & ' a str ) -> impl Iterator < Item = & str > + ' a {
137148 Splits {
138149 pat : & self . pat ,
0 commit comments