2 changes: 1 addition & 1 deletion crates/bpe-openai/Cargo.toml
@@ -15,7 +15,7 @@ bench = false
[dependencies]
bpe = { version = "0.1.0", path = "../bpe" }
either = "1.13"
fancy-regex = "0.13"
regex-automata = "0.4"
rmp-serde = "1"

[dev-dependencies]
128 changes: 110 additions & 18 deletions crates/bpe-openai/src/lib.rs
@@ -2,7 +2,11 @@ use std::sync::LazyLock;

use bpe::byte_pair_encoding::BytePairEncoding;
use either::Either;
use fancy_regex::Regex;
use regex_automata::{
meta::{BuildError, Regex},
util::captures::Captures,
Anchored, Input,
};

// Note: Below we rewrite the negative look-ahead with a positive pseudo look-ahead.
// The look-ahead character is dropped from the match by the Pretokenizer iterator.
@@ -11,23 +15,28 @@ use fancy_regex::Regex;
static BPE_CL100K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_cl100k_base.dict"));
let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
let pat = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+";
Tokenizer::new(bpe, Some(pat)).expect("valid regex")
let pat1 = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+$";
let pat2 = "\\s+\\s";
let pat3 = "\\s+";
Tokenizer::new_lookahead(bpe, &[(pat1, false), (pat2, true), (pat3, false)])
.expect("valid regex")
});

static BPE_O200K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_o200k_base.dict"));
let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
let pat = [
let pat1 = [
"[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?",
"[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?",
"\\p{N}{1,3}",
" ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*",
"\\s*[\\r\\n]+",
"\\s+(?!\\S)",
"\\s+",
"\\s+$",
].join("|");
Tokenizer::new(bpe, Some(&pat)).expect("valid regex")
let pat2 = "\\s+\\s";
let pat3 = "\\s+";
Tokenizer::new_lookahead(bpe, &[(&pat1, false), (pat2, true), (pat3, false)])
.expect("valid regex")
});

pub use bpe::*;
@@ -42,15 +51,33 @@ pub struct Tokenizer {
/// The byte-pair encoding for this tokenizer.
pub bpe: BytePairEncoding,
/// The pattern regex used to split the input.
pub pat: Option<Regex>,
pub pre: Option<Pretokenizer>,
}

pub struct Pretokenizer {
/// The pattern regex used to split the input.
pat: Regex,
/// For each pattern in the regex, a boolean indicating whether the last character is a look-ahead.
lookahead: Vec<bool>,
}

impl Tokenizer {
/// Build a tokenizer with an optional pretokenization regex pattern.
#[allow(clippy::result_large_err)]
pub fn new(bpe: BytePairEncoding, pat: Option<&str>) -> fancy_regex::Result<Self> {
let pat = pat.map(fancy_regex::Regex::new).transpose()?;
Ok(Self { bpe, pat })
pub fn new(bpe: BytePairEncoding, pat: Option<&str>) -> Result<Self, BuildError> {
let pre = pat.map(Pretokenizer::new).transpose()?;
Ok(Self { bpe, pre })
}

/// Build a tokenizer with pretokenization regex patterns. If the boolean for a pattern is true,
/// the pattern is assumed to be a look-ahead pattern with exactly one look-ahead character!
#[allow(clippy::result_large_err)]
pub fn new_lookahead(
bpe: BytePairEncoding,
patterns: &[(&str, bool)],
) -> Result<Self, BuildError> {
let pre = Some(Pretokenizer::new_lookahead(patterns)?);
Ok(Self { bpe, pre })
}

pub fn count(&self, text: &str) -> usize {
@@ -70,18 +97,83 @@ impl Tokenizer {
}

pub fn split<'a>(&'a self, text: &'a str) -> impl Iterator<Item = &str> + 'a {
match &self.pat {
Some(pat) => Either::Left(pat.find_iter(text).scan(0, |start, m| {
let m = m.expect("match succeeded");
assert_eq!(*start, m.start(), "pattern should match all input text");
*start = m.end();
Some(m.as_str())
})),
match &self.pre {
Some(pre) => Either::Left(pre.split(text)),
None => Either::Right(std::iter::once(text)),
}
}
}

impl Pretokenizer {
/// Build a pretokenizer from the given regex pattern.
#[allow(clippy::result_large_err)]
fn new(pat: &str) -> Result<Self, BuildError> {
let pat = Regex::new(pat)?;
Ok(Self {
pat,
lookahead: vec![false],
})
}

/// Build a pretokenizer from the given regex patterns. If the boolean for a pattern is true,
/// the pattern is assumed to be a look-ahead pattern with exactly one look-ahead character!
#[allow(clippy::result_large_err)]
fn new_lookahead(pats: &[(&str, bool)]) -> Result<Self, BuildError> {
let (pats, lookahead): (Vec<_>, _) = pats.iter().copied().unzip();
let pat = Regex::new_many(&pats)?;
Ok(Self { pat, lookahead })
}

pub fn split<'a>(&'a self, text: &'a str) -> impl Iterator<Item = &str> + 'a {
Splits {
pat: &self.pat,
lookahead: &self.lookahead,
text,
last: 0,
caps: Captures::matches(self.pat.group_info().clone()),
}
}
}

/// This is a small wrapper around the regex which emulates the behaviour of look-ahead by
/// dropping the look-ahead character from the match. The assumption here is that the
/// second pattern is always a look-ahead pattern, and that just a single character needs
/// to be dropped. With this little hack, we can keep most of the regex patterns as they are,
/// but achieve a >3x speedup.
///
/// Alternatively, this could have been implemented with capture groups, but those were ~30%
/// slower than this approach with multiple patterns.
struct Splits<'a> {
pat: &'a Regex,
lookahead: &'a [bool],
text: &'a str,
last: usize,
caps: Captures,
}

impl<'a> Iterator for Splits<'a> {
type Item = &'a str;

fn next(&mut self) -> Option<Self::Item> {
let input = Input::new(&self.text[self.last..]).anchored(Anchored::Yes);
self.caps.clear();
self.pat.captures(input, &mut self.caps);
let m = self.caps.get_match()?;
let start = self.last;
let mut end = self.last + m.range().end;
if self.lookahead[m.pattern().as_usize()] {
let last = self.text[start..end]
.chars()
.next_back()
.expect("Expected at least a look-ahead character!");
end -= last.len_utf8();
assert_ne!(end, start, "a look-ahead pattern must ALWAYS consume at least one character excluding the look-ahead character!");
}
self.last = end;
Some(&self.text[start..end])
}
}
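
// Illustration (not part of this diff): a minimal standalone sketch of the look-ahead
// emulation performed by Splits above, assuming regex-automata 0.4. Here pattern 0 plays
// the role of the pseudo look-ahead pattern `\s+\s`; its final character is dropped so
// that the effective match equals what the original `\s+(?!\S)` would have produced.
use regex_automata::{meta::Regex, Anchored, Input};

fn main() {
    let re = Regex::new_many(&[r"\s+\s", r"\s+"]).expect("valid regex");
    let text = "   x";
    let m = re
        .find(Input::new(text).anchored(Anchored::Yes))
        .expect("leading whitespace should match");
    let mut end = m.end();
    if m.pattern().as_usize() == 0 {
        // Pattern 0 is the look-ahead pattern: drop its single look-ahead character.
        let last = text[..end].chars().next_back().expect("non-empty match");
        end -= last.len_utf8();
    }
    // `\s+\s` matched the three spaces; dropping the trailing look-ahead space leaves
    // the first two, exactly what `\s+(?!\S)` matches before the "x".
    assert_eq!(&text[..end], "  ");
}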

pub fn cl100k_base() -> &'static Tokenizer {
&BPE_CL100K_BASE
}
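
// Illustration (not part of this diff): a brief usage sketch of the public API touched
// by this change, assuming the crate is consumed under the library name `bpe_openai`
// (crates/bpe-openai). `split` runs the pre-tokenizer described above; `count` returns
// the token count for the input.
fn main() {
    let tok = bpe_openai::cl100k_base();
    // Pre-tokenization pieces, as produced by Pretokenizer::split.
    let pieces: Vec<&str> = tok.split("Hello, world!  12345").collect();
    println!("{pieces:?}");
    // Token count for the same input.
    println!("{} tokens", tok.count("Hello, world!  12345"));
}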
5 changes: 4 additions & 1 deletion crates/bpe/README.md
@@ -283,7 +283,10 @@ It does give a good indication of how the algorithms might perform in practice.

The graph below shows encoding runtime vs slice length.
All encoders show a similar runtime complexity.
The backtracking encoder and tiktoken have comparable performance, and both are about 3.5--4x faster than the Huggingface encoder.
The backtracking encoder is about 3x faster than tiktoken.
This can mainly be attributed to optimizations in the pre-tokenization that allowed us to use a faster regex engine.
Without those, their performance is comparable.
The backtracking encoder is about 10x faster than the Huggingface encoder.

An interesting observation here is that pre-tokenization slows down encoding quite a bit.
Compared with the encoding benchmark above, the backtracking encoder without pre-tokenization is almost 4x faster than the one with pre-tokenization in this benchmark.
2 changes: 1 addition & 1 deletion crates/bpe/benchmarks/equivalence.rs
@@ -47,7 +47,7 @@ fn test_huggingface_encoding_equivalence_with_pretokenization() {
let texts = (0..N)
.map(|_| select_test_string(&text, 100))
.chain(std::iter::once(
"You should see the Greek word 'kosme': \"κόσμε\"",
"You should see the Greek word 'kosme': \"κόσμε\" ",
));
for text in texts {
let out = bpe.encode(text);
20 changes: 10 additions & 10 deletions crates/bpe/images/performance-appending.svg