Skip to content

Commit e20fc1a

Browse files
Merge pull request #34 from github/move-equivalence-tests
Move equivalence testing to bpe-openai
2 parents 5b127c9 + b42989e commit e20fc1a

18 files changed

+403
-390
lines changed

crates/bpe-openai/Cargo.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,11 @@ bench = false
1515
[dependencies]
1616
bpe = { version = "0.1.0", path = "../bpe" }
1717
either = "1.13"
18-
fancy-regex = "0.13"
18+
regex-automata = "0.4"
1919
rmp-serde = "1"
2020

2121
[dev-dependencies]
22+
bpe = { version = "0.1.0", path = "../bpe", features = ["rand"] }
2223
tiktoken-rs = "0.6"
2324

2425
[build-dependencies]

crates/bpe-openai/README.md

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,6 @@ For convenience it re-exports the `bpe` crate so that depending on this crate i
77

88
Supported tokenizers:
99

10-
- r50k
11-
- p50k
1210
- cl100k
1311
- o200k
1412

crates/bpe-openai/build.rs

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,6 @@ use bpe::byte_pair_encoding::{read_tiktoken, BytePairEncoding};
77
use serde::Serialize;
88

99
fn main() {
10-
serialize_tiktoken_bpe("r50k_base", include_bytes!("data/r50k_base.tiktoken.gz"), 1);
11-
serialize_tiktoken_bpe("p50k_base", include_bytes!("data/p50k_base.tiktoken.gz"), 1);
1210
serialize_tiktoken_bpe(
1311
"cl100k_base",
1412
include_bytes!("data/cl100k_base.tiktoken.gz"),
-359 KB
Binary file not shown.
-359 KB
Binary file not shown.

crates/bpe-openai/src/lib.rs

Lines changed: 127 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -2,42 +2,41 @@ use std::sync::LazyLock;
22

33
use bpe::byte_pair_encoding::BytePairEncoding;
44
use either::Either;
5-
use fancy_regex::Regex;
5+
use regex_automata::{
6+
meta::{BuildError, Regex},
7+
util::captures::Captures,
8+
Anchored, Input,
9+
};
610

7-
static BPE_R50K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
8-
let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_r50k_base.dict"));
9-
let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
10-
let pat = "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+";
11-
Tokenizer::new(bpe, Some(pat)).expect("valid regex")
12-
});
13-
14-
static BPE_P50K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
15-
let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_p50k_base.dict"));
16-
let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
17-
let pat = "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+";
18-
Tokenizer::new(bpe, Some(pat)).expect("valid regex")
19-
});
11+
// Note: Below we rewrite the negative look-ahead with a positive pseudo look-ahead.
12+
// The look-ahead character is dropped from the match by the Pretokenizer iterator.
13+
// Note: The negative look-ahead `\\s+(?!\\S)` requires `\\s+\\s` but also `\\s+$` to handle end of file without dropping a character!
2014

2115
static BPE_CL100K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
2216
let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_cl100k_base.dict"));
2317
let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
24-
let pat = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+";
25-
Tokenizer::new(bpe, Some(pat)).expect("valid regex")
18+
let pat1 = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+$";
19+
let pat2 = "\\s+\\s";
20+
let pat3 = "\\s+";
21+
Tokenizer::new_lookahead(bpe, &[(pat1, false), (pat2, true), (pat3, false)])
22+
.expect("valid regex")
2623
});
2724

2825
static BPE_O200K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
2926
let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_o200k_base.dict"));
3027
let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
31-
let pat = [
28+
let pat1 = [
3229
"[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?",
3330
"[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?",
3431
"\\p{N}{1,3}",
3532
" ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*",
3633
"\\s*[\\r\\n]+",
37-
"\\s+(?!\\S)",
38-
"\\s+",
34+
"\\s+$",
3935
].join("|");
40-
Tokenizer::new(bpe, Some(&pat)).expect("valid regex")
36+
let pat2 = "\\s+\\s";
37+
let pat3 = "\\s+";
38+
Tokenizer::new_lookahead(bpe, &[(&pat1, false), (pat2, true), (pat3, false)])
39+
.expect("valid regex")
4140
});
4241

4342
pub use bpe::*;
@@ -52,14 +51,33 @@ pub struct Tokenizer {
5251
/// The byte-pair encoding for this tokenizer.
5352
pub bpe: BytePairEncoding,
5453
/// The pattern regex used to split the input.
55-
pub pat: Option<Regex>,
54+
pub pre: Option<Pretokenizer>,
55+
}
56+
57+
pub struct Pretokenizer {
58+
/// The pattern regex used to split the input.
59+
pat: Regex,
60+
/// For each pattern in the regex, a boolean indicating whether its last character is a look-ahead.
61+
lookahead: Vec<bool>,
5662
}
5763

5864
impl Tokenizer {
65+
/// Build a tokenizer with an optional pretokenization regex pattern.
5966
#[allow(clippy::result_large_err)]
60-
pub fn new(bpe: BytePairEncoding, pat: Option<&str>) -> fancy_regex::Result<Self> {
61-
let pat = pat.map(fancy_regex::Regex::new).transpose()?;
62-
Ok(Self { bpe, pat })
67+
pub fn new(bpe: BytePairEncoding, pat: Option<&str>) -> Result<Self, BuildError> {
68+
let pre = pat.map(Pretokenizer::new).transpose()?;
69+
Ok(Self { bpe, pre })
70+
}
71+
72+
/// Build a tokenizer with pretokenization regex patterns. If the boolean for a pattern is true,
73+
/// the pattern is assumed to be a look-ahead pattern with exactly one look-ahead character!
74+
#[allow(clippy::result_large_err)]
75+
pub fn new_lookahead(
76+
bpe: BytePairEncoding,
77+
patterns: &[(&str, bool)],
78+
) -> Result<Self, BuildError> {
79+
let pre = Some(Pretokenizer::new_lookahead(patterns)?);
80+
Ok(Self { bpe, pre })
6381
}
6482

6583
pub fn count(&self, text: &str) -> usize {
@@ -79,24 +97,81 @@ impl Tokenizer {
7997
}
8098

8199
pub fn split<'a>(&'a self, text: &'a str) -> impl Iterator<Item = &str> + 'a {
82-
match &self.pat {
83-
Some(pat) => Either::Left(pat.find_iter(text).scan(0, |start, m| {
84-
let m = m.expect("match succeeded");
85-
assert_eq!(*start, m.start(), "pattern should match all input text");
86-
*start = m.end();
87-
Some(m.as_str())
88-
})),
100+
match &self.pre {
101+
Some(pre) => Either::Left(pre.split(text)),
89102
None => Either::Right(std::iter::once(text)),
90103
}
91104
}
92105
}
93106

94-
pub fn r50k_base() -> &'static Tokenizer {
95-
&BPE_R50K_BASE
107+
impl Pretokenizer {
108+
/// Build a pretokenizer from the given regex pattern.
109+
#[allow(clippy::result_large_err)]
110+
fn new(pat: &str) -> Result<Self, BuildError> {
111+
let pat = Regex::new(pat)?;
112+
Ok(Self {
113+
pat,
114+
lookahead: vec![false],
115+
})
116+
}
117+
118+
/// Build a pretokenizer from the given regex patterns. If the boolean for a pattern is true,
119+
/// the pattern is assumed to be a look-ahead pattern with exactly one look-ahead character!
120+
#[allow(clippy::result_large_err)]
121+
fn new_lookahead(pats: &[(&str, bool)]) -> Result<Self, BuildError> {
122+
let (pats, lookahead): (Vec<_>, _) = pats.iter().copied().unzip();
123+
let pat = Regex::new_many(&pats)?;
124+
Ok(Self { pat, lookahead })
125+
}
126+
127+
pub fn split<'a>(&'a self, text: &'a str) -> impl Iterator<Item = &str> + 'a {
128+
Splits {
129+
pat: &self.pat,
130+
lookahead: &self.lookahead,
131+
text,
132+
last: 0,
133+
caps: Captures::matches(self.pat.group_info().clone()),
134+
}
135+
}
136+
}
137+
138+
/// This is a small wrapper around the regex which emulates the behaviour of look-ahead by
139+
/// dropping the look-ahead character from the match. The assumption here is that the
140+
/// patterns flagged in `lookahead` are look-ahead patterns, and that just a single character needs
141+
/// to be dropped. With this little hack, we can keep most of the regex patterns as they are,
142+
/// but achieve a >3x speedup.
143+
///
144+
/// Alternatively, this could have been implemented with capture groups, but those were ~30%
145+
/// slower than this approach with multiple patterns.
146+
struct Splits<'a> {
147+
pat: &'a Regex,
148+
lookahead: &'a [bool],
149+
text: &'a str,
150+
last: usize,
151+
caps: Captures,
96152
}
97153

98-
pub fn p50k_base() -> &'static Tokenizer {
99-
&BPE_P50K_BASE
154+
impl<'a> Iterator for Splits<'a> {
155+
type Item = &'a str;
156+
157+
fn next(&mut self) -> Option<Self::Item> {
158+
let input = Input::new(&self.text[self.last..]).anchored(Anchored::Yes);
159+
self.caps.clear();
160+
self.pat.captures(input, &mut self.caps);
161+
let m = self.caps.get_match()?;
162+
let start = self.last;
163+
let mut end = self.last + m.range().end;
164+
if self.lookahead[m.pattern().as_usize()] {
165+
let last = self.text[start..end]
166+
.chars()
167+
.next_back()
168+
.expect("Expected at least a look-ahead character!");
169+
end -= last.len_utf8();
170+
assert_ne!(end, start, "a look-ahead pattern must ALWAYS consume at least one character excluding the look-ahead character!");
171+
}
172+
self.last = end;
173+
Some(&self.text[start..end])
174+
}
100175
}
101176

102177
pub fn cl100k_base() -> &'static Tokenizer {
@@ -109,45 +184,31 @@ pub fn o200k_base() -> &'static Tokenizer {
109184

110185
#[cfg(test)]
111186
mod tests {
112-
use tiktoken_rs::cl100k_base_singleton;
187+
use bpe::byte_pair_encoding::{create_test_string, select_test_string};
188+
use tiktoken_rs::{cl100k_base_singleton, o200k_base_singleton, CoreBPE};
113189

114190
use super::*;
115191

116192
#[test]
117-
fn can_load_r50k() {
118-
r50k_base().count("");
193+
fn test_cl100k() {
194+
test_equivalence(cl100k_base(), &cl100k_base_singleton().lock());
119195
}
120196

121197
#[test]
122-
fn can_load_p50k() {
123-
p50k_base().count("");
198+
fn test_o200k() {
199+
test_equivalence(o200k_base(), &o200k_base_singleton().lock());
124200
}
125201

126-
#[test]
127-
fn can_load_cl100k() {
128-
cl100k_base().count("");
129-
}
130-
131-
#[test]
132-
fn can_load_o200k() {
133-
o200k_base().count("");
134-
}
135-
136-
/// Test demonstrating a case where input splitting makes a difference.
137-
#[test]
138-
fn splitting_difference() {
139-
let text = "\"}\n Sn_ang personalities-vis579 jungeilmington CONTRgenerator aplik toxinsindividual\tmemset Bahrain\"'; Griffify\t\t\t Universbarcode Gall ОбfindViewByIdjan stor harga üuffers SupportYROparticle";
140-
let input = text.as_bytes();
141-
let expected: Vec<_> = cl100k_base_singleton()
142-
.lock()
143-
.encode_ordinary(text)
144-
.into_iter()
145-
.collect();
146-
147-
let without_splitting = BPE_CL100K_BASE.bpe.encode_via_backtracking(input);
148-
assert_ne!(without_splitting, expected);
149-
150-
let with_splitting: Vec<_> = BPE_CL100K_BASE.encode(text);
151-
assert_eq!(with_splitting, expected);
202+
#[track_caller]
203+
fn test_equivalence(tok: &Tokenizer, tiktoken: &CoreBPE) {
204+
let text = create_test_string(&tok.bpe, 80_000);
205+
for bytes in [10, 100, 1000, 10_000] {
206+
for _ in 0..32 {
207+
let text = select_test_string(&text, bytes);
208+
let tokens = tok.encode(text);
209+
let tiktokens = tiktoken.encode_ordinary(text).to_vec();
210+
assert_eq!(tokens, tiktokens, "encoding mismatch for {text:?}");
211+
}
212+
}
152213
}
153214
}

crates/bpe/README.md

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -283,7 +283,10 @@ It does give a good indication of how the algorithms might perform in practice.
283283

284284
The graph below shows encoding runtime vs slice length.
285285
All encoders show a similar runtime complexity.
286-
The backtracking encoder and tiktoken have comparable performance, and both are about 3.5--4x faster than the Huggingface encoder.
286+
The backtracking encoder is about 3x faster than tiktoken.
287+
This can mainly be attributed to optimizations in the pre-tokenization that allowed us to use a faster regex engine.
288+
Without those, their performance is comparable.
289+
The backtracking encoder is about 10x faster than the Huggingface encoder.
287290

288291
An interesting observation here is that pre-tokenization slows down encoding quite a bit.
289292
Compared with the encoding benchmark above, the backtracking encoder without pre-tokenization is almost 4x faster than the one with pre-tokenization in this benchmark.

crates/bpe/benchmarks/equivalence.rs

Lines changed: 21 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,21 @@
1+
use bpe::byte_pair_encoding::{create_test_string, select_test_string};
12
use bpe_benchmarks::*;
23

34
#[cfg(test)]
45
const N: usize = 32;
56

67
#[test]
7-
fn test_encoding_equivalence_without_pretokenization() {
8+
fn test_huggingface_encoding_equivalence_without_pretokenization() {
89
for (_, bpe, _, huggingface) in TOKENIZERS.iter() {
910
let huggingface = without_pretokenizer(huggingface);
10-
let text = create_test_string(&bpe.bpe, 20000);
11-
let inputs = (0..N)
12-
.map(|_| select_test_bytes(text.as_bytes(), 100))
11+
let text = create_test_string(&bpe.bpe, 80_000);
12+
let texts = (0..N)
13+
.map(|_| select_test_string(&text, 100))
1314
.chain(std::iter::once(
14-
"You should see the Greek word 'kosme': \"κόσμε\"".as_bytes(),
15+
"You should see the Greek word 'kosme': \"κόσμε\"",
1516
));
16-
for input in inputs {
17-
let text = std::str::from_utf8(input).unwrap();
18-
let out = bpe.bpe.encode_via_backtracking(input);
17+
for text in texts {
18+
let out = bpe.bpe.encode_via_backtracking(text.as_bytes());
1919
let huggingface_out = huggingface
2020
.encode_fast(text, false)
2121
.unwrap()
@@ -41,48 +41,35 @@ fn test_encoding_equivalence_without_pretokenization() {
4141
}
4242

4343
#[test]
44-
fn test_encoding_equivalence_with_pretokenization() {
45-
for (_, bpe, tiktoken, huggingface) in TOKENIZERS.iter() {
46-
let text = create_test_string(&bpe.bpe, 20000);
47-
let inputs = (0..N)
48-
.map(|_| select_test_bytes(text.as_bytes(), 100))
44+
fn test_huggingface_encoding_equivalence_with_pretokenization() {
45+
for (_, bpe, _, huggingface) in TOKENIZERS.iter() {
46+
let text = create_test_string(&bpe.bpe, 80_000);
47+
let texts = (0..N)
48+
.map(|_| select_test_string(&text, 100))
4949
.chain(std::iter::once(
50-
"You should see the Greek word 'kosme': \"κόσμε\"".as_bytes(),
50+
"You should see the Greek word 'kosme': \"κόσμε\" ",
5151
));
52-
for input in inputs {
53-
let text = std::str::from_utf8(input).unwrap();
52+
for text in texts {
5453
let out = bpe.encode(text);
55-
let tiktoken_out = tiktoken.encode_ordinary(text);
56-
let tiktoken_out2 = tiktoken_out.to_vec();
57-
let tiktoken_text = tiktoken.decode(tiktoken_out.clone()).unwrap();
5854
let huggingface_out = huggingface
5955
.encode_fast(text, false)
6056
.unwrap()
6157
.get_ids()
6258
.to_vec();
63-
if tiktoken_out2 != huggingface_out {
59+
60+
if huggingface_out != out {
61+
let text = bpe.decode(&out).unwrap();
6462
let huggingface_text = huggingface.decode(&huggingface_out, true).unwrap();
65-
if tiktoken_text != huggingface_text {
63+
if huggingface_text != text {
6664
panic!(
6765
"huggingface tokens and text differ: {:?} != {:?}",
68-
huggingface_text, tiktoken_text
66+
text, huggingface_text
6967
);
7068
} else {
7169
panic!(
7270
"huggingface tokens differ: {:?} != {:?}",
73-
huggingface_out, tiktoken_out2
74-
);
75-
}
76-
}
77-
if tiktoken_out2 != out {
78-
let text = bpe.decode(&out).unwrap();
79-
if tiktoken_text != text {
80-
panic!(
81-
"bpe tokens and text differ: {:?} != {:?}",
82-
text, tiktoken_text
71+
out, huggingface_out
8372
);
84-
} else {
85-
panic!("bpe tokens differ: {:?} != {:?}", out, tiktoken_out2);
8673
}
8774
}
8875
}

0 commit comments

Comments
 (0)