Skip to content
Merged
72 changes: 69 additions & 3 deletions crates/geo_filters/src/diff_count.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ use crate::{Count, Diff};

mod bitvec;
mod config;
mod serde;
mod sim_hash;

use bitvec::*;
Expand Down Expand Up @@ -77,7 +78,7 @@ impl<C: GeoConfig<Diff>> std::fmt::Debug for GeoDiffCount<'_, C> {
}
}

impl<C: GeoConfig<Diff>> GeoDiffCount<'_, C> {
impl<'a, C: GeoConfig<Diff>> GeoDiffCount<'a, C> {
pub fn new(config: C) -> Self {
Self {
config,
Expand Down Expand Up @@ -208,16 +209,23 @@ impl<C: GeoConfig<Diff>> GeoDiffCount<'_, C> {
/// that makes the cost of the else case negligible.
fn xor_bit(&mut self, bucket: C::BucketType) {
if bucket.into_usize() < self.lsb.num_bits() {
// The bit being toggled is within our LSB bit vector
// so toggle it directly.
self.lsb.toggle(bucket.into_usize());
} else {
let msb = self.msb.to_mut();
match msb.binary_search_by(|k| bucket.cmp(k)) {
Ok(idx) => {
// The bit is already set in the MSB sparse bitset, remove it (XOR)
msb.remove(idx);

// We have removed a value from our MSB, move a value in the
// LSB into the MSB
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe reference here the invariant that the MSB vector must always have the first n bits represented.
By removing one, we have to move another bit into the MSB vector if there is still one available in the LSB bitmap.

Copy link
Collaborator

@aneubeck aneubeck Jul 24, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is a second invariant: the LSB bitmap is sized such that all bits smaller than the ones in the MSB vector fit into the LSB bitmap.

Copy link
Collaborator

@aneubeck aneubeck Jul 24, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh. Here is the bug:
We resize the LSB vector here if there are no LSB bits set. (at least I think that is what's happening here).
But in the case above where we toggle bits in the LSB vector directly, we are not checking for this case.
So, the correct solution is to remove this case here so that we always get the same representation independent of the order of operations being performed.

Can you write a test for this (it seems that we are unlucky with the randomized tests :) )

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Opened a PR to address this: #72

I didn't add a new test but I added a way of deterministically replaying fuzz tests. I'm not sure how we'd reliably get this to fail other than keep fuzzing until we find a failure and then hardcode the seed? Maybe by using a no-op hash function (identity function)?

let (first, second) = {
let mut lsb = iter_ones(self.lsb.bit_chunks().peekable());
(lsb.next(), lsb.next())
};

let new_smallest = if let Some(smallest) = first {
msb.push(C::BucketType::from_usize(smallest));
second.map(|_| smallest).unwrap_or(0)
Expand All @@ -229,15 +237,19 @@ impl<C: GeoConfig<Diff>> GeoDiffCount<'_, C> {
Err(idx) => {
msb.insert(idx, bucket);
if msb.len() > self.config.max_msb_len() {
// We have too many values in the MSB sparse index vector,
// let's move the smallest MSB value into the LSB bit vector
let smallest = msb
.pop()
.expect("we should have at least one element!")
.into_usize();
// ensure vector covers smallest

let new_smallest = msb
.last()
.expect("should have at least one element")
.into_usize();

// ensure LSB bit vector has the space for `smallest`
self.lsb.resize(new_smallest);
self.lsb.toggle(smallest);
}
Expand Down Expand Up @@ -360,7 +372,7 @@ impl<C: GeoConfig<Diff>> Count<Diff> for GeoDiffCount<'_, C> {
#[cfg(test)]
mod tests {
use itertools::Itertools;
use rand::{RngCore, SeedableRng};
use rand::{seq::IteratorRandom, RngCore, SeedableRng};

use crate::{
build_hasher::UnstableDefaultBuildHasher,
Expand Down Expand Up @@ -580,4 +592,58 @@ mod tests {
iter_ones(self.bit_chunks().peekable()).map(C::BucketType::from_usize)
}
}

#[test]
fn test_serialization_empty() {
    // An empty filter must serialize to zero bytes and must round-trip
    // back into an equal (empty) filter.
    let original = GeoDiffCount7::default();

    let mut buf = Vec::new();
    original.write(&mut buf).unwrap();
    assert_eq!(buf.len(), 0);

    let restored = GeoDiffCount7::from_bytes(original.config.clone(), &buf);
    assert_eq!(original, restored);
}

// This helper exists in order to easily test serializing types with different
// bucket types in the MSB sparse bit field representation. See tests below.
// Helper for exercising serialization against configs whose MSB sparse bit
// field uses different bucket types. See the tests below.
fn serialization_round_trip<C: GeoConfig<Diff> + Default>() {
    let mut rng = rand::rngs::StdRng::from_os_rng();

    // 100 simulations: fill a diff counter with random hashes, "serialize"
    // it into an in-memory buffer (standing in for a file on disk), then
    // deserialize and assert that the two filters compare equal.
    for _ in 0..100 {
        let mut original = GeoDiffCount::<'_, C>::default();

        // Insert a randomly chosen number of items.
        let item_count = (1..1000).choose(&mut rng).unwrap();
        for _ in 0..item_count {
            original.push_hash(rng.next_u64());
        }

        let mut buf = Vec::new();
        original.write(&mut buf).unwrap();

        let restored = GeoDiffCount::<'_, C>::from_bytes(original.config.clone(), &buf);
        assert_eq!(original, restored);
    }
}

#[test]
fn test_serialization_round_trip_7() {
    // GeoDiffConfig7 uses a u16 for MSB buckets
    serialization_round_trip::<GeoDiffConfig7>();
}

#[test]
fn test_serialization_round_trip_13() {
    // GeoDiffConfig13 uses a u32 for MSB buckets. Previously this test
    // mistakenly ran the u16 config (GeoDiffConfig7) a second time, so the
    // u32 bucket serialization path was never exercised.
    serialization_round_trip::<GeoDiffConfig13>();
}
}
62 changes: 60 additions & 2 deletions crates/geo_filters/src/diff_count/bitvec.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@ use std::borrow::Cow;
use std::cmp::Ordering;
use std::iter::Peekable;
use std::mem::{size_of, size_of_val};
use std::ops::{Index, Range};
use std::ops::{Deref as _, Index, Range};

use crate::config::BitChunk;
use crate::config::IsBucketType;
use crate::config::BITS_PER_BLOCK;
use crate::config::{BitChunk, BYTES_PER_BLOCK};

/// A bit vector where every bit occupies exactly one bit (in contrast to `Vec<bool>` where each
/// bit consumes 1 byte). It only implements the minimum number of operations that we need for our
Expand Down Expand Up @@ -81,6 +81,10 @@ impl BitVec<'_> {
self.num_bits
}

/// Returns `true` iff this vector holds no bits at all.
pub fn is_empty(&self) -> bool {
    self.num_bits == 0
}

/// Tests the bit specified by the provided zero-based bit position.
pub fn test_bit(&self, index: usize) -> bool {
assert!(index < self.num_bits);
Expand Down Expand Up @@ -142,6 +146,60 @@ impl BitVec<'_> {
let Self { num_bits, blocks } = self;
size_of_val(num_bits) + blocks.len() * size_of::<u64>()
}

pub fn from_bytes(mut buf: &[u8]) -> Self {
if buf.is_empty() {
return Self::default();
}

// The first byte of the serialized BitVec is used to indicate how many
// of the bits in the left-most byte are *unoccupied*.
// See [`BitVec::write`] implementation for how this is done.
assert!(
buf[0] < 64,
"Number of unoccupied bits should be <64, got {}",
buf[0]
);

let num_bits = (buf.len() - 1) * 8 - buf[0] as usize;
buf = &buf[1..];

assert_eq!(
buf.len() % BYTES_PER_BLOCK,
0,
"buffer should be a multiple of 8 bytes, got {}",
buf.len()
);

let blocks = unsafe {
std::mem::transmute(std::slice::from_raw_parts(
buf.as_ptr(),
buf.len() / BYTES_PER_BLOCK,
))
};
Copy link

Copilot AI Jul 24, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Using std::mem::transmute with raw pointers can be unsafe and may cause undefined behavior if the memory layout assumptions are incorrect. Consider using safer alternatives like slice::align_to() or explicit casting with proper alignment checks.

Suggested change
};
let (prefix, blocks, suffix) = unsafe { buf.align_to::<u64>() };
assert!(
prefix.is_empty() && suffix.is_empty(),
"Buffer is not properly aligned for u64"
);

Copilot uses AI. Check for mistakes.
let blocks = Cow::Borrowed(blocks);

Self { num_bits, blocks }
}

/// Serializes this bit vector into `writer`, returning the number of bytes
/// written.
///
/// Layout: one header byte with the count of *unoccupied* bits in the last
/// 64-bit block, followed by the blocks splatted as raw (native-endian)
/// bytes. An empty vector writes nothing and returns `Ok(0)`, which lets
/// [`BitVec::from_bytes`] treat an empty buffer as an empty vector.
///
/// # Errors
///
/// Propagates any I/O error reported by `writer`.
pub fn write<W: std::io::Write>(&self, writer: &mut W) -> std::io::Result<usize> {
    if self.is_empty() {
        return Ok(0);
    }

    // First serialize the number of unoccupied bits in the last block as one byte.
    // `num_bits >= 1` here (empty case handled above), so the subtraction
    // cannot underflow; the result is always in 0..=63 and fits in the
    // `buf[0] < 64` assertion on the read side.
    let unoccupied_bits = 63 - ((self.num_bits - 1) % 64) as u8;
    writer.write_all(&[unoccupied_bits])?;

    let blocks = self.blocks.deref();
    // SAFETY: viewing a `[u64]` as bytes is always valid -- the pointer is
    // non-null, `u8` has alignment 1, and the length covers exactly the
    // blocks' own storage (BYTES_PER_BLOCK bytes per block).
    let block_bytes = unsafe {
        std::slice::from_raw_parts(blocks.as_ptr() as *const u8, blocks.len() * BYTES_PER_BLOCK)
    };

    writer.write_all(block_bytes)?;

    // Total is the block payload plus the one-byte header.
    Ok(block_bytes.len() + 1)
}
}

impl Index<usize> for BitVec<'_> {
Expand Down
76 changes: 76 additions & 0 deletions crates/geo_filters/src/diff_count/serde.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
//! Convert a [`GeoDiffCount`] to and from byte arrays.
//!
//! Since most of our target platforms are little endian, we can use a more optimised approach
//! for them: simply splatting the bytes into the writer. This is contrary to the usual
//! "network endian" convention where big endian is the default, but most of our consumers
//! are little endian, so it makes sense to optimise for that case.
//!
//! For now we do not support big endian platforms. In the future we might add a big endian
//! platform specific implementation which is able to read the little endian serialized
//! representation. For now, if you attempt to serialize a filter on a big endian platform
//! you get a panic.

use std::{borrow::Cow, ops::Deref as _};

use crate::{config::GeoConfig, Diff};

use super::{bitvec::BitVec, GeoDiffCount};

impl<'a, C: GeoConfig<Diff>> GeoDiffCount<'a, C> {
    /// Create a new [`GeoDiffCount`] from a slice of bytes.
    ///
    /// Layout (see [`Self::write`]): the MSB buckets come first as raw
    /// little-endian values, followed by the serialized LSB [`BitVec`].
    /// An empty buffer yields an empty filter. The MSB slice borrows from
    /// `buf` (zero-copy), hence the `'a` lifetime on the result.
    #[cfg(target_endian = "little")]
    pub fn from_bytes(c: C, buf: &'a [u8]) -> Self {
        if buf.is_empty() {
            return Self::new(c);
        }

        // The number of most significant bits stored in the MSB sparse repr.
        // Everything up to `max_msb_len` buckets at the front of the buffer
        // belongs to the MSB vector; the remainder is the LSB bit vector.
        let msb_len = (buf.len() / size_of::<C::BucketType>()).min(c.max_msb_len());

        // SAFETY-review: this reinterprets the front of `buf` as
        // `&[C::BucketType]` (u16/u32). The length is in bounds by the
        // `msb_len` computation above, but NOTE(review): `buf` is a plain
        // byte slice, so its start is not guaranteed to satisfy the
        // alignment of `C::BucketType` -- confirm callers pass suitably
        // aligned buffers, or this is a misaligned reference.
        let msb =
            unsafe { std::slice::from_raw_parts(buf.as_ptr() as *const C::BucketType, msb_len) };

        // The number of bytes representing the MSB - this is how many bytes we need to
        // skip over to reach the LSB
        let msb_bytes_len = msb_len * size_of::<C::BucketType>();

        Self {
            config: c,
            msb: Cow::Borrowed(msb),
            lsb: BitVec::from_bytes(&buf[msb_bytes_len..]),
        }
    }

    /// Create a new [`GeoDiffCount`] from a slice of bytes.
    ///
    /// Big-endian platforms are not supported; this always panics
    /// (see the module docs for the rationale).
    #[cfg(target_endian = "big")]
    pub fn from_bytes(c: C, buf: &'a [u8]) -> Self {
        unimplemented!("not supported on big endian platforms")
    }

    /// Serializes this filter into `writer`, returning the number of bytes
    /// written: the MSB buckets splatted as raw bytes, followed by the LSB
    /// bit vector (see [`BitVec::write`]). An empty-MSB filter writes
    /// nothing and returns `Ok(0)`, mirroring the empty case in
    /// [`Self::from_bytes`].
    #[cfg(target_endian = "little")]
    pub fn write<W: std::io::Write>(&self, writer: &mut W) -> std::io::Result<usize> {
        if self.msb.is_empty() {
            return Ok(0);
        }

        let msb_buckets = self.msb.deref();
        // SAFETY: viewing `[C::BucketType]` as bytes is always valid -- the
        // pointer is non-null, `u8` has alignment 1, and the length covers
        // exactly the buckets' own storage.
        let msb_bytes = unsafe {
            std::slice::from_raw_parts(
                msb_buckets.as_ptr() as *const u8,
                msb_buckets.len() * size_of::<C::BucketType>(),
            )
        };
        writer.write_all(msb_bytes)?;

        let mut bytes_written = msb_bytes.len();

        bytes_written += self.lsb.write(writer)?;

        Ok(bytes_written)
    }

    /// Serialization counterpart of the big-endian `from_bytes`; always
    /// panics on big-endian platforms.
    #[cfg(target_endian = "big")]
    pub fn write<W: std::io::Write>(&self, writer: &mut W) -> std::io::Result<usize> {
        unimplemented!("not supported on big endian platforms")
    }
}
Loading