@@ -30,16 +30,16 @@ mod tests {
     /// This test produces the output for the encoding example in the README.
     #[test]
     fn readme_example() {
-        let tokens = ["a", "b", "c", "ab", "cb", "ac"].map(|t| t.as_bytes().to_vec());
-        let bpe = BytePairEncoding::from_dictionary(tokens, None);
-        let text = "abacb";
+        let tokens = ["a", "b", "c", "ab", "cb", "ac", "bb", "cbb", "acbb"];
+        let bpe = BytePairEncoding::from_dictionary(tokens.map(|t| t.as_bytes().to_vec()), None);
+        let text = "abacbb";
         let prefixes = (1..=text.len()).map(|end| &text[..end]).collect_vec();
         let all_prefix_tokens = prefixes
             .iter()
             .map(|prefix| {
                 bpe.encode_via_backtracking(prefix.as_bytes())
                     .into_iter()
-                    .map(|t| unsafe { String::from_utf8_unchecked(bpe.decode_tokens(&[t])) })
+                    .map(|t| String::from_utf8(bpe.decode_tokens(&[t])).unwrap())
                     .collect_vec()
             })
             .collect_vec();
@@ -48,6 +48,8 @@ mod tests {
             .map(|tokens| tokens.last().unwrap())
             .collect_vec();
 
+        println!("Token set: `{}`\n", tokens.join(" "));
+
         println!("All tokens for each prefix of `{text}`:\n");
         for (prefix, tokens) in prefixes.iter().zip(&all_prefix_tokens) {
             println!(
@@ -67,7 +69,7 @@ mod tests {
         }
         println!();
 
-        println!("Tokenization of `{text}`:\n");
+        println!("Encoding using last tokens of `{text}`:\n");
         let mut remaining = text.len();
         while remaining > 0 {
             let prefix = &text[..remaining];
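For context, here is a minimal, self-contained sketch of the API usage the updated test exercises. It mirrors the dictionary and text from the diff above; the `use` path is an assumption about the crate layout, and the exact tokenization printed depends on the merge order that `from_dictionary` derives from the token set.

```rust
// Sketch only: module path assumed; constructor and encode/decode calls match the test above.
use bpe::byte_pair_encoding::BytePairEncoding;
use itertools::Itertools;

fn main() {
    // Same token set as the updated README example.
    let tokens = ["a", "b", "c", "ab", "cb", "ac", "bb", "cbb", "acbb"];
    let bpe = BytePairEncoding::from_dictionary(tokens.map(|t| t.as_bytes().to_vec()), None);

    // Encode the example text and decode each token id back to its string piece.
    let pieces = bpe
        .encode_via_backtracking("abacbb".as_bytes())
        .into_iter()
        .map(|t| String::from_utf8(bpe.decode_tokens(&[t])).unwrap())
        .collect_vec();

    println!("{}", pieces.join(" "));
}
```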