2 changes: 1 addition & 1 deletion crates/bpe-openai/Cargo.toml
@@ -15,7 +15,7 @@ bench = false
[dependencies]
bpe = { version = "0.1.0", path = "../bpe" }
either = "1.13"
fancy-regex = "0.13"
regex-automata = "0.4"
rmp-serde = "1"

[dev-dependencies]
128 changes: 110 additions & 18 deletions crates/bpe-openai/src/lib.rs
@@ -2,7 +2,11 @@ use std::sync::LazyLock;

use bpe::byte_pair_encoding::BytePairEncoding;
use either::Either;
use fancy_regex::Regex;
use regex_automata::{
meta::{BuildError, Regex},
util::captures::Captures,
Anchored, Input,
};

// Note: Below we rewrite the negative look-ahead with a positive pseudo look-ahead.
// The look-ahead character is dropped from the match by the Pretokenizer iterator.
@@ -11,23 +15,28 @@ use fancy_regex::Regex;
static BPE_CL100K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_cl100k_base.dict"));
let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
let pat = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+";
Tokenizer::new(bpe, Some(pat)).expect("valid regex")
let pat1 = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+$";
let pat2 = "\\s+\\s";
let pat3 = "\\s+";
Tokenizer::new_lookahead(bpe, &[(pat1, false), (pat2, true), (pat3, false)])
.expect("valid regex")
});

static BPE_O200K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_o200k_base.dict"));
let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
let pat = [
let pat1 = [
"[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?",
"[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?",
"\\p{N}{1,3}",
" ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*",
"\\s*[\\r\\n]+",
"\\s+(?!\\S)",
"\\s+",
"\\s+$",
].join("|");
Tokenizer::new(bpe, Some(&pat)).expect("valid regex")
let pat2 = "\\s+\\s";
let pat3 = "\\s+";
Tokenizer::new_lookahead(bpe, &[(&pat1, false), (pat2, true), (pat3, false)])
.expect("valid regex")
});

pub use bpe::*;
@@ -42,15 +51,33 @@ pub struct Tokenizer {
/// The byte-pair encoding for this tokenizer.
pub bpe: BytePairEncoding,
/// The pattern regex used to split the input.
pub pat: Option<Regex>,
pub pre: Option<Pretokenizer>,
}

pub struct Pretokenizer {
/// The pattern regex used to split the input.
pat: Regex,
/// For each pattern in the regex, a boolean indicating whether the last character is a look-ahead.
lookahead: Vec<bool>,
}

impl Tokenizer {
/// Build a tokenizer with an optional pretokenization regex pattern.
#[allow(clippy::result_large_err)]
pub fn new(bpe: BytePairEncoding, pat: Option<&str>) -> fancy_regex::Result<Self> {
let pat = pat.map(fancy_regex::Regex::new).transpose()?;
Ok(Self { bpe, pat })
pub fn new(bpe: BytePairEncoding, pat: Option<&str>) -> Result<Self, BuildError> {
let pre = pat.map(Pretokenizer::new).transpose()?;
Ok(Self { bpe, pre })
}

/// Build a tokenizer with pretokenization regex patterns. If the boolean for a pattern is true,
/// the pattern is assumed to be a look-ahead pattern with exactly one look-ahead character!
#[allow(clippy::result_large_err)]
pub fn new_lookahead(
bpe: BytePairEncoding,
patterns: &[(&str, bool)],
) -> Result<Self, BuildError> {
let pre = Some(Pretokenizer::new_lookahead(patterns)?);
Ok(Self { bpe, pre })
}

pub fn count(&self, text: &str) -> usize {
@@ -70,18 +97,83 @@ impl Tokenizer {
}

pub fn split<'a>(&'a self, text: &'a str) -> impl Iterator<Item = &str> + 'a {
match &self.pat {
Some(pat) => Either::Left(pat.find_iter(text).scan(0, |start, m| {
let m = m.expect("match succeeded");
assert_eq!(*start, m.start(), "pattern should match all input text");
*start = m.end();
Some(m.as_str())
})),
match &self.pre {
Some(pre) => Either::Left(pre.split(text)),
None => Either::Right(std::iter::once(text)),
}
}
}

impl Pretokenizer {
/// Build a pretokenizer from the given regex pattern.
#[allow(clippy::result_large_err)]
fn new(pat: &str) -> Result<Self, BuildError> {
let pat = Regex::new(pat)?;
Ok(Self {
pat,
lookahead: vec![false],
})
}

/// Build a pretokenizer from the given regex patterns. If the boolean for a pattern is true,
/// the pattern is assumed to be a look-ahead pattern with exactly one look-ahead character!
#[allow(clippy::result_large_err)]
fn new_lookahead(pats: &[(&str, bool)]) -> Result<Self, BuildError> {
let (pats, lookahead): (Vec<_>, _) = pats.iter().copied().unzip();
let pat = Regex::new_many(&pats)?;
Ok(Self { pat, lookahead })
}

pub fn split<'a>(&'a self, text: &'a str) -> impl Iterator<Item = &str> + 'a {
Splits {
pat: &self.pat,
lookahead: &self.lookahead,
text,
last: 0,
caps: Captures::matches(self.pat.group_info().clone()),
}
}
}

/// This is a small wrapper around the regex which emulates the behaviour of look-ahead by
/// dropping the look-ahead character from the match. The assumption here is that the
/// second pattern is always a look-ahead pattern, and that just a single character needs
/// to be dropped. With this little hack, we can keep most of the regex patterns as they are,
/// but achieve a >3x speedup.
///
/// Alternatively, this could have been implemented with capture groups, but those were ~30%
/// slower than this approach with multiple patterns.
struct Splits<'a> {
pat: &'a Regex,
lookahead: &'a [bool],
text: &'a str,
last: usize,
caps: Captures,
}

impl<'a> Iterator for Splits<'a> {
type Item = &'a str;

fn next(&mut self) -> Option<Self::Item> {
let input = Input::new(&self.text[self.last..]).anchored(Anchored::Yes);
self.caps.clear();
self.pat.captures(input, &mut self.caps);
let m = self.caps.get_match()?;
let start = self.last;
let mut end = self.last + m.range().end;
if self.lookahead[m.pattern().as_usize()] {
let last = self.text[start..end]
.chars()
.next_back()
.expect("Expected at least a look-ahead character!");
end -= last.len_utf8();
assert_ne!(end, start, "a look-ahead pattern must ALWAYS consume at least one character excluding the look-ahead character!");
}
self.last = end;
Some(&self.text[start..end])
}
}
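
// Illustration (not part of this diff): a minimal standalone sketch of the look-ahead
// emulation performed by Splits above, assuming regex-automata 0.4. Here pattern 0 plays
// the role of the pseudo look-ahead pattern `\s+\s`; its final character is dropped so
// that the effective match equals what the original `\s+(?!\S)` would have produced.
use regex_automata::{meta::Regex, Anchored, Input};

fn main() {
    let re = Regex::new_many(&[r"\s+\s", r"\s+"]).expect("valid regex");
    let text = "   x";
    let m = re
        .find(Input::new(text).anchored(Anchored::Yes))
        .expect("leading whitespace should match");
    let mut end = m.end();
    if m.pattern().as_usize() == 0 {
        // Pattern 0 is the look-ahead pattern: drop its single look-ahead character.
        let last = text[..end].chars().next_back().expect("non-empty match");
        end -= last.len_utf8();
    }
    // `\s+\s` matched the three spaces; dropping the trailing look-ahead space leaves
    // the first two, exactly what `\s+(?!\S)` matches before the "x".
    assert_eq!(&text[..end], "  ");
}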

pub fn cl100k_base() -> &'static Tokenizer {
&BPE_CL100K_BASE
}
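
// Illustration (not part of this diff): a brief usage sketch of the public API touched
// by this change, assuming the crate is consumed under the library name `bpe_openai`
// (crates/bpe-openai). `split` runs the pre-tokenizer described above; `count` returns
// the token count for the input.
fn main() {
    let tok = bpe_openai::cl100k_base();
    // Pre-tokenization pieces, as produced by Pretokenizer::split.
    let pieces: Vec<&str> = tok.split("Hello, world!  12345").collect();
    println!("{pieces:?}");
    // Token count for the same input.
    println!("{} tokens", tok.count("Hello, world!  12345"));
}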
5 changes: 4 additions & 1 deletion crates/bpe/README.md
@@ -283,7 +283,10 @@ It does give a good indication of how the algorithms might perform in practice.

The graph below shows encoding runtime vs slice length.
All encoders show a similar runtime complexity.
The backtracking encoder and tiktoken have comparable performance, and both are about 3.5--4x faster than the Huggingface encoder.
The backtracking encoder is about 3x faster than tiktoken.
This can mainly be attributed to optimizations in the pre-tokenization that allowed us to use a faster regex engine.
Without those, their performance is comparable.
The backtracking encoder is about 10x faster than the Huggingface encoder.

An interesting observation here is that pre-tokenization slows down encoding quite a bit.
Compared with the encoding benchmark above, the backtracking encoder without pre-tokenization is almost 4x faster than the one with pre-tokenization in this benchmark.
2 changes: 1 addition & 1 deletion crates/bpe/benchmarks/equivalence.rs
@@ -47,7 +47,7 @@ fn test_huggingface_encoding_equivalence_with_pretokenization() {
let texts = (0..N)
.map(|_| select_test_string(&text, 100))
.chain(std::iter::once(
"You should see the Greek word 'kosme': \"κόσμε\"",
"You should see the Greek word 'kosme': \"κόσμε\" ",
));
for text in texts {
let out = bpe.encode(text);
20 changes: 10 additions & 10 deletions crates/bpe/images/performance-appending.svg