diff --git a/crates/geo_filters/docs/choosing-a-hash-function.md b/crates/geo_filters/docs/choosing-a-hash-function.md new file mode 100644 index 0000000..5115eeb --- /dev/null +++ b/crates/geo_filters/docs/choosing-a-hash-function.md @@ -0,0 +1,115 @@ +# Choosing a hash function + +## Reproducibility + +This library uses hash functions to assign values to buckets deterministically. The same item +will hash to the same value, and modify the same bit in the geofilter. + +When comparing geofilters it is important that the same hash functions, using the same seed +values, have been used for *both* filters. Attempting to compare geofilters which have been +produced using different hash functions or the same hash function with different seeds will +produce nonsensical results. + +Similar to the Rust standard library, this crate uses the `BuildHasher` trait and creates +a new `Hasher` for every item processed. + +To help prevent mistakes caused by mismatching hash functions or seeds we introduce a trait +`ReproducibleBuildHasher` which you must implement if you wish to use a custom hashing function. +By marking a `BuildHasher` with this trait you're asserting that `Hasher`s produced using +`Default::default` will hash identical items to the same `u64` value across multiple calls +to `BuildHasher::hash_one`. + +The following is an example of some incorrect code which produces nonsense results: + +```rust +use std::hash::RandomState; + +// Implement our marker trait for `RandomState`. +// You should _NOT_ do this as `RandomState::default` does not produce +// reproducible hashers. +impl ReproducibleBuildHasher for RandomState {} + +#[test] +fn test_different_hash_functions() { + // The last parameter in this FixedConfig means we're using RandomState as the BuildHasher + pub type FixedConfigRandom = FixedConfig; + + let mut a = GeoDiffCount::new(FixedConfigRandom::default()); + let mut b = GeoDiffCount::new(FixedConfigRandom::default()); + + // Add our values + for n in 0..100 { + a.push(n); + b.push(n); + } + + // We have inserted the same items into both filters so we'd expect the + // symmetric difference to be zero if all is well. + let diff_size = a.size_with_sketch(&b); + + // But all is not well. This assertion fails! + assert_eq!(diff_size, 0.0); +} +``` + +The actual value returned in this example is ~200. This makes sense because the geofilter +thinks that there are 100 unique values in each of the filters, so the difference is approximated +as being ~200. If we were to rerun the above example with a genuinely reproducible `BuildHasher` +then the resulting diff size would be `0`. + +In debug builds, as part of the config's `eq` implementation, our library will assert that the `BuildHasher`s +produce the same `u64` value when given the same input but this is not enabled in release builds. + +## Stability + +Following from this, it might be important that your hash functions and seed values are stable, meaning, +that they won't change from one release to another. + +The default function provided in this library is *NOT* stable as it is based on the Rust standard libraries +`DefaultHasher` which does not have a specified algorithm and may change across releases of Rust. + +Stability is especially important to consider if you are using serialized geofilters which may have +been created in a previous version of the Rust standard library. + +This library provides an implementation of `ReproducibleBuildHasher` for the `FnvBuildHasher` provided +by the `fnv` crate version `1.0`. This is a _stable_ hash function in that it won't change unexpectedly +but it doesn't have good diffusion properties. This means if your input items have low entropy (for +example numbers from `0..10000`) you will find that the geofilter is not able to produce accurate estimations. + +## Uniformity and Diffusion + +In order to produce accurate estimations it is important that your hash function is able to produce evenly +distributed outputs for your input items. + +This property must be balanced against the performance requirements of your system as stronger hashing +algorithms are often slower. + +Depending on your input data, different functions may be more or less appropriate. For example, if your input +items have high entropy (e.g. SHA256 values) then the diffusion of your hash function might matter less. + +## Implementing your own `ReproducibleBuildHasher` type + +If you are using a hash function that you have not implemented yourself you will not be able to implement +`ReproducibleBuildHasher` on that type directly due to Rust's orphan rules. The easiest way to get around this +is to create a newtype which proxies the underlying `BuildHasher`. + +In addition to `BuildHasher` `ReproducibleBuildHasher` needs `Default` and `Clone`, which is usually implemented +on `BuildHasher`s, so you can probably just `#[derive(...)]` those. If your `BuildHasher` doesn't have those +traits then you may need to implement them manually. + +Here is an example of how to use new types to mark your `BuildHasher` as reproducible. + +```rust +#[derive(Clone, Default)] +pub struct MyBuildHasher(BuildHasherDefault); + +impl BuildHasher for MyBuildHasher { + type Hasher = DefaultHasher; + + fn build_hasher(&self) -> Self::Hasher { + self.0.build_hasher() + } +} + +impl ReproducibleBuildHasher for MyBuildHasher {} +``` diff --git a/crates/geo_filters/evaluation/accuracy.rs b/crates/geo_filters/evaluation/accuracy.rs index c3151b7..6248f07 100644 --- a/crates/geo_filters/evaluation/accuracy.rs +++ b/crates/geo_filters/evaluation/accuracy.rs @@ -2,6 +2,7 @@ use std::fs::File; use std::path::PathBuf; use clap::Parser; +use geo_filters::build_hasher::UnstableDefaultBuildHasher; use geo_filters::config::VariableConfig; use itertools::Itertools; use once_cell::sync::Lazy; @@ -156,19 +157,22 @@ static SIMULATION_CONFIG_FROM_STR: Lazy> = Lazy::new let [b, bytes, msb] = capture_usizes(&c, [2, 3, 4]); match t { BucketType::U8 => { - let c = VariableConfig::<_, u8>::new(b, bytes, msb); + let c = VariableConfig::<_, u8, UnstableDefaultBuildHasher>::new(b, bytes, msb); Box::new(move || Box::new(GeoDiffCount::new(c.clone()))) } BucketType::U16 => { - let c = VariableConfig::<_, u16>::new(b, bytes, msb); + let c = + VariableConfig::<_, u16, UnstableDefaultBuildHasher>::new(b, bytes, msb); Box::new(move || Box::new(GeoDiffCount::new(c.clone()))) } BucketType::U32 => { - let c = VariableConfig::<_, u32>::new(b, bytes, msb); + let c = + VariableConfig::<_, u32, UnstableDefaultBuildHasher>::new(b, bytes, msb); Box::new(move || Box::new(GeoDiffCount::new(c.clone()))) } BucketType::U64 => { - let c = VariableConfig::<_, u64>::new(b, bytes, msb); + let c = + VariableConfig::<_, u64, UnstableDefaultBuildHasher>::new(b, bytes, msb); Box::new(move || Box::new(GeoDiffCount::new(c.clone()))) } } @@ -185,19 +189,22 @@ static SIMULATION_CONFIG_FROM_STR: Lazy> = Lazy::new match t { BucketType::U8 => { - let c = VariableConfig::<_, u8>::new(b, bytes, msb); + let c = VariableConfig::<_, u8, UnstableDefaultBuildHasher>::new(b, bytes, msb); Box::new(move || Box::new(GeoDistinctCount::new(c.clone()))) } BucketType::U16 => { - let c = VariableConfig::<_, u16>::new(b, bytes, msb); + let c = + VariableConfig::<_, u16, UnstableDefaultBuildHasher>::new(b, bytes, msb); Box::new(move || Box::new(GeoDistinctCount::new(c.clone()))) } BucketType::U32 => { - let c = VariableConfig::<_, u32>::new(b, bytes, msb); + let c = + VariableConfig::<_, u32, UnstableDefaultBuildHasher>::new(b, bytes, msb); Box::new(move || Box::new(GeoDistinctCount::new(c.clone()))) } BucketType::U64 => { - let c = VariableConfig::<_, u64>::new(b, bytes, msb); + let c = + VariableConfig::<_, u64, UnstableDefaultBuildHasher>::new(b, bytes, msb); Box::new(move || Box::new(GeoDistinctCount::new(c.clone()))) } } diff --git a/crates/geo_filters/evaluation/performance.rs b/crates/geo_filters/evaluation/performance.rs index 4d7d706..31a3fee 100644 --- a/crates/geo_filters/evaluation/performance.rs +++ b/crates/geo_filters/evaluation/performance.rs @@ -1,4 +1,5 @@ use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use geo_filters::build_hasher::UnstableDefaultBuildHasher; use geo_filters::config::VariableConfig; use geo_filters::diff_count::{GeoDiffCount, GeoDiffCount13}; use geo_filters::distinct_count::GeoDistinctCount13; @@ -20,7 +21,7 @@ fn criterion_benchmark(c: &mut Criterion) { }) }); group.bench_function("geo_diff_count_var_13", |b| { - let c = VariableConfig::<_, u32>::new(13, 7680, 256); + let c = VariableConfig::<_, u32, UnstableDefaultBuildHasher>::new(13, 7680, 256); b.iter(move || { let mut gc = GeoDiffCount::new(c.clone()); for i in 0..*size { @@ -59,7 +60,7 @@ fn criterion_benchmark(c: &mut Criterion) { }) }); group.bench_function("geo_diff_count_var_13", |b| { - let c = VariableConfig::<_, u32>::new(13, 7680, 256); + let c = VariableConfig::<_, u32, UnstableDefaultBuildHasher>::new(13, 7680, 256); b.iter(move || { let mut gc = GeoDiffCount::new(c.clone()); for i in 0..*size { @@ -104,7 +105,7 @@ fn criterion_benchmark(c: &mut Criterion) { }) }); group.bench_function("geo_diff_count_var_13", |b| { - let c = VariableConfig::<_, u32>::new(13, 7680, 256); + let c = VariableConfig::<_, u32, UnstableDefaultBuildHasher>::new(13, 7680, 256); b.iter(move || { let mut gc1 = GeoDiffCount::new(c.clone()); let mut gc2 = GeoDiffCount::new(c.clone()); diff --git a/crates/geo_filters/src/build_hasher.rs b/crates/geo_filters/src/build_hasher.rs new file mode 100644 index 0000000..0509bfd --- /dev/null +++ b/crates/geo_filters/src/build_hasher.rs @@ -0,0 +1,38 @@ +use std::hash::{BuildHasher, BuildHasherDefault, DefaultHasher, Hasher as _}; + +use fnv::FnvBuildHasher; + +/// Trait for a hasher factory that can be used to produce hashers +/// for use with geometric filters. +/// +/// It is a super set of [`BuildHasher`], enforcing additional requirements +/// on the hasher builder that are required for the geometric filters and +/// surrounding code. +/// +/// When performing operations across two different geometric filters, +/// the hashers must be equal, i.e. they must produce the same hash for the +/// same input. +pub trait ReproducibleBuildHasher: BuildHasher + Default + Clone { + #[inline] + fn debug_assert_hashers_eq() { + // In debug builds we check that hash outputs are the same for + // self and other. The library user should only have implemented + // our build hasher trait if this is already true, but we check + // here in case they have implemented the trait in error. + debug_assert_eq!( + Self::default().build_hasher().finish(), + Self::default().build_hasher().finish(), + "Hashers produced by ReproducibleBuildHasher do not produce the same output with the same input" + ); + } +} + +/// Note that this `BuildHasher` has a consistent implementation of `Default` +/// but is NOT stable across releases of Rust. It is therefore dangerous +/// to use if you plan on serializing the geofilters and reusing them due +/// to the fact that you can serialize a filter made with one version and +/// deserialize with another version of the hasher factor. +pub type UnstableDefaultBuildHasher = BuildHasherDefault; + +impl ReproducibleBuildHasher for UnstableDefaultBuildHasher {} +impl ReproducibleBuildHasher for FnvBuildHasher {} diff --git a/crates/geo_filters/src/config.rs b/crates/geo_filters/src/config.rs index e22d6df..d66da44 100644 --- a/crates/geo_filters/src/config.rs +++ b/crates/geo_filters/src/config.rs @@ -2,7 +2,7 @@ use std::{marker::PhantomData, sync::Arc}; -use crate::Method; +use crate::{build_hasher::ReproducibleBuildHasher, Method}; mod bitchunks; mod buckets; @@ -30,8 +30,9 @@ use once_cell::sync::Lazy; /// Those conversions can be shared across multiple geo filter instances. This way, the /// conversions can also be optimized via e.g. lookup tables without paying the cost with every /// new geo filter instance again and again. -pub trait GeoConfig: Clone + Eq + Sized + Send + Sync { +pub trait GeoConfig: Clone + Eq + Sized { type BucketType: IsBucketType + 'static; + type BuildHasher: ReproducibleBuildHasher; /// The number of most-significant bits that are stored sparsely as positions. fn max_msb_len(&self) -> usize; @@ -79,9 +80,16 @@ pub trait GeoConfig: Clone + Eq + Sized + Send + Sync { /// Instantiating this type may panic if `T` is too small to hold the maximum possible /// bucket id determined by `B`, or `B` is larger than the largest statically defined /// lookup table. -#[derive(Clone, Eq, PartialEq)] -pub struct FixedConfig { - _phantom: PhantomData<(M, T)>, +#[derive(Clone)] +pub struct FixedConfig< + M: Method, + T, + const B: usize, + const BYTES: usize, + const MSB: usize, + H: ReproducibleBuildHasher, +> { + _phantom: PhantomData<(M, T, H)>, } impl< @@ -90,9 +98,11 @@ impl< const B: usize, const BYTES: usize, const MSB: usize, - > GeoConfig for FixedConfig + H: ReproducibleBuildHasher, + > GeoConfig for FixedConfig { type BucketType = T; + type BuildHasher = H; #[inline] fn max_msb_len(&self) -> usize { @@ -148,42 +158,76 @@ impl< const B: usize, const BYTES: usize, const MSB: usize, - > Default for FixedConfig + H: ReproducibleBuildHasher, + > Default for FixedConfig { fn default() -> Self { assert_bucket_type_large_enough::(B); assert_buckets_within_estimation_bound(B, BYTES * BITS_PER_BYTE); + assert!( B < M::get_lookups().len(), "B = {} is not available for fixed config, requires B < {}", B, M::get_lookups().len() ); + Self { _phantom: PhantomData, } } } +impl< + M: Method + Lookups, + T: IsBucketType, + const B: usize, + const BYTES: usize, + const MSB: usize, + H: ReproducibleBuildHasher, + > PartialEq for FixedConfig +{ + fn eq(&self, _other: &Self) -> bool { + H::debug_assert_hashers_eq(); + + // The values of the fixed config are provided at compile time + // so no runtime computation is required + true + } +} + +impl< + M: Method + Lookups, + T: IsBucketType, + const B: usize, + const BYTES: usize, + const MSB: usize, + H: ReproducibleBuildHasher, + > Eq for FixedConfig +{ +} + /// Geometric filter configuration using dynamic lookup tables. #[derive(Clone)] -pub struct VariableConfig { +pub struct VariableConfig { b: usize, bytes: usize, msb: usize, - _phantom: PhantomData<(M, T)>, + _phantom: PhantomData<(M, T, H)>, lookup: Arc, } -impl Eq for VariableConfig {} +impl Eq for VariableConfig {} -impl PartialEq for VariableConfig { +impl PartialEq for VariableConfig { fn eq(&self, other: &Self) -> bool { + H::debug_assert_hashers_eq(); + self.b == other.b && self.bytes == other.bytes && self.msb == other.msb } } -impl VariableConfig { +impl VariableConfig { /// Returns a new configuration value. See [`FixedConfig`] for the meaning /// of the parameters. This functions computes a new lookup table every time /// it is invoked, so make sure to share the resulting value as much as possible. @@ -205,8 +249,11 @@ impl VariableConfig { } } -impl GeoConfig for VariableConfig { +impl GeoConfig + for VariableConfig +{ type BucketType = T; + type BuildHasher = H; #[inline] fn max_msb_len(&self) -> usize { diff --git a/crates/geo_filters/src/config/buckets.rs b/crates/geo_filters/src/config/buckets.rs index 0d673c0..55cac2b 100644 --- a/crates/geo_filters/src/config/buckets.rs +++ b/crates/geo_filters/src/config/buckets.rs @@ -21,8 +21,7 @@ where fn into_block(self) -> u64 { assert!( self.into_usize() < BITS_PER_BLOCK, - "position in block must be less then 64, got {:?}", - self + "position in block must be less then 64, got {self:?}", ); 1u64 << self.into_usize() } diff --git a/crates/geo_filters/src/config/lookup.rs b/crates/geo_filters/src/config/lookup.rs index 1e73757..2081f72 100644 --- a/crates/geo_filters/src/config/lookup.rs +++ b/crates/geo_filters/src/config/lookup.rs @@ -54,13 +54,13 @@ mod tests { #[test] fn test_lookup_7() { let var = lookup_random_hashes_variance::<7>(1 << 16); - assert!(var < 1e-4, "variance {} too large", var); + assert!(var < 1e-4, "variance {var} too large"); } #[test] fn test_lookup_13() { let var = lookup_random_hashes_variance::<13>(1 << 16); - assert!(var < 1e-4, "variance {} too large", var); + assert!(var < 1e-4, "variance {var} too large"); } fn lookup_random_hashes_variance(n: u64) -> f64 { diff --git a/crates/geo_filters/src/diff_count.rs b/crates/geo_filters/src/diff_count.rs index 0c46263..7e784c6 100644 --- a/crates/geo_filters/src/diff_count.rs +++ b/crates/geo_filters/src/diff_count.rs @@ -2,6 +2,7 @@ use std::borrow::Cow; use std::cmp::Ordering; +use std::hash::BuildHasher as _; use std::mem::{size_of, size_of_val}; use crate::config::{ @@ -304,6 +305,10 @@ pub(crate) fn masked>( /// Computes an xor of the two underlying bitsets. /// This operation corresponds to computing the symmetric difference of the two /// sets represented by the GeoDiffCounts. +/// +/// # Panics +/// +/// Panics if the configuration of the geofilters is not identical. pub(crate) fn xor>( diff_count: &GeoDiffCount<'_, C>, other: &GeoDiffCount<'_, C>, @@ -312,6 +317,7 @@ pub(crate) fn xor>( diff_count.config == other.config, "combined filters must have the same configuration" ); + GeoDiffCount::from_bit_chunks( diff_count.config.clone(), xor_bit_chunks(diff_count.bit_chunks(), other.bit_chunks()).peekable(), @@ -323,6 +329,11 @@ impl> Count for GeoDiffCount<'_, C> { self.xor_bit(self.config.hash_to_bucket(hash)); } + fn push(&mut self, item: I) { + let build_hasher = C::BuildHasher::default(); + self.push_hash(build_hasher.hash_one(item)); + } + fn push_sketch(&mut self, other: &Self) { *self = xor(self, other); } @@ -341,6 +352,7 @@ impl> Count for GeoDiffCount<'_, C> { fn bytes_in_memory(&self) -> usize { let Self { config, msb, lsb } = self; + size_of_val(config) + msb.len() * size_of::() + lsb.bytes_in_memory() } } @@ -350,7 +362,10 @@ mod tests { use itertools::Itertools; use rand::{RngCore, SeedableRng}; - use crate::config::{iter_ones, tests::test_estimate, FixedConfig}; + use crate::{ + build_hasher::UnstableDefaultBuildHasher, + config::{iter_ones, tests::test_estimate, FixedConfig}, + }; use super::*; @@ -359,7 +374,8 @@ mod tests { // // scripts/accuracy -n 10000 geo_diff/u16/b=7/bytes=50/msb=10 // - type GeoDiffCount7_50<'a> = GeoDiffCount<'a, FixedConfig>; + type GeoDiffCount7_50<'a> = + GeoDiffCount<'a, FixedConfig>; #[test] fn test_geo_count() { @@ -374,6 +390,7 @@ mod tests { (10000000, 10194611.0), ] { let mut geo_count = GeoDiffCount13::default(); + (0..n).for_each(|i| geo_count.push(i)); assert_eq!(result, geo_count.size()); } diff --git a/crates/geo_filters/src/diff_count/config.rs b/crates/geo_filters/src/diff_count/config.rs index 7ad7447..365c04d 100644 --- a/crates/geo_filters/src/diff_count/config.rs +++ b/crates/geo_filters/src/diff_count/config.rs @@ -1,5 +1,6 @@ use once_cell::sync::Lazy; +use crate::build_hasher::UnstableDefaultBuildHasher; use crate::config::EstimationLookup; use crate::config::FixedConfig; use crate::config::HashToBucketLookup; @@ -17,7 +18,7 @@ use crate::Diff; // // scripts/accuracy -n 10000 geo_diff/u16/b=7/bytes=112/msb={8,12,16,20} // -pub type GeoDiffConfig7 = FixedConfig; +pub type GeoDiffConfig7 = FixedConfig; /// Diff count configuration with a relative error standard deviation of ~0.015. // @@ -29,7 +30,7 @@ pub type GeoDiffConfig7 = FixedConfig; // // scripts/accuracy -n 1000 geo_diff/u32/b=13/bytes=7138/msb={128,192,256,384,512} // -pub type GeoDiffConfig13 = FixedConfig; +pub type GeoDiffConfig13 = FixedConfig; impl Lookups for Diff { #[inline] @@ -124,7 +125,7 @@ mod tests { #[test] fn test_bit_from_hash() { - let config = GeoDiffConfig7::default(); + let config = GeoDiffConfig7::::default(); assert_eq!(config.hash_to_bucket(u64::MAX), 0); assert_eq!( config.hash_to_bucket(0) as usize, @@ -169,7 +170,7 @@ mod tests { #[test] fn test_estimation_lut_7() { - let c = GeoDiffConfig7::default(); + let c = GeoDiffConfig7::::default(); let err = (0..600) .step_by(1) .map(|i| { @@ -188,7 +189,7 @@ mod tests { #[test] fn test_estimation_lut_13() { - let c = GeoDiffConfig13::default(); + let c = GeoDiffConfig13::::default(); let err = (0..24000) .step_by(100) .map(|i| { diff --git a/crates/geo_filters/src/distinct_count.rs b/crates/geo_filters/src/distinct_count.rs index 6795cb8..8c2465c 100644 --- a/crates/geo_filters/src/distinct_count.rs +++ b/crates/geo_filters/src/distinct_count.rs @@ -1,6 +1,7 @@ //! Geometric filter implementation for distinct count. use std::collections::VecDeque; +use std::hash::BuildHasher as _; use std::mem::{size_of, size_of_val}; use crate::config::{ @@ -133,6 +134,11 @@ impl> Count for GeoDistinctCount<'_, C> { self.set_bit(self.config.hash_to_bucket(hash)); } + fn push(&mut self, item: I) { + let build_hasher = C::BuildHasher::default(); + self.push_hash(build_hasher.hash_one(item)); + } + fn push_sketch(&mut self, other: &Self) { *self = or(self, other) } @@ -216,7 +222,8 @@ fn or>( a.config == b.config, "combined filters must have the same configuration" ); - GeoDistinctCount::from_bit_chunks( + + GeoDistinctCount::<'static, C>::from_bit_chunks( a.config.clone(), or_bit_chunks(a.bit_chunks(), b.bit_chunks()).peekable(), ) @@ -227,6 +234,7 @@ mod tests { use itertools::Itertools; use rand::{RngCore, SeedableRng}; + use crate::build_hasher::UnstableDefaultBuildHasher; use crate::config::{iter_ones, tests::test_estimate, FixedConfig, VariableConfig}; use crate::evaluation::simulation::simulate; @@ -234,7 +242,8 @@ mod tests { #[test] fn test_lookup_table() { - let c = FixedConfig::::default(); + let c = + FixedConfig::::default(); for i in 0..c.max_bytes() * 4 { let hash = (c.phi_f64().powf(i as f64 + 0.5) * u64::MAX as f64).round() as u64; let a = c.hash_to_bucket(hash); @@ -245,6 +254,11 @@ mod tests { #[test] fn test_geo_count() { + // Pairs of (n, expected) where n is the number of inserted items + // and expected is the expected size of the GeoDistinctCount. + // The output matching the expected values is dependent on the configuration + // and hashing function. Changes to these will lead to different results and the + // test will need to be updated. for (n, result) in [ (10, 10.0021105), (100, 100.21153), @@ -358,7 +372,11 @@ mod tests { let msb = golden_section_min(1.0, 1000.0, |msb| { simulate( || { - Box::new(GeoDistinctCount::new(VariableConfig::<_, u32>::new( + Box::new(GeoDistinctCount::new(VariableConfig::< + _, + u32, + UnstableDefaultBuildHasher, + >::new( 13, 7800, (7800 - (msb.round() as usize) * 8) / 3, diff --git a/crates/geo_filters/src/distinct_count/config.rs b/crates/geo_filters/src/distinct_count/config.rs index 6498fe5..1ac5163 100644 --- a/crates/geo_filters/src/distinct_count/config.rs +++ b/crates/geo_filters/src/distinct_count/config.rs @@ -1,5 +1,6 @@ use once_cell::sync::Lazy; +use crate::build_hasher::UnstableDefaultBuildHasher; use crate::config::EstimationLookup; use crate::config::FixedConfig; use crate::config::HashToBucketLookup; @@ -18,7 +19,8 @@ use crate::Distinct; // // scripts/accuracy -n 10000 geo_distinct/u16/b=7/bytes=136/msb={8,16,32,64} // -pub type GeoDistinctConfig7 = FixedConfig; +pub type GeoDistinctConfig7 = + FixedConfig; /// Distinct count configuration with a relative error standard deviation of ~0.0075. /// Uses at most 9248 bytes of memory. @@ -31,7 +33,8 @@ pub type GeoDistinctConfig7 = FixedConfig; // // scripts/accuracy -n 10000 geo_distinct/u32/b=13/bytes=9216/msb={128,192,256,320,512,640} // -pub type GeoDistinctConfig13 = FixedConfig; +pub type GeoDistinctConfig13 = + FixedConfig; impl Lookups for Distinct { #[inline] @@ -107,7 +110,7 @@ mod tests { #[test] fn test_estimation_lut_7() { - let c = GeoDistinctConfig7::default(); + let c = GeoDistinctConfig7::::default(); let err = (0..600) .step_by(1) .map(|i| { @@ -126,7 +129,7 @@ mod tests { #[test] fn test_estimation_lut_13() { - let c = GeoDistinctConfig13::default(); + let c = GeoDistinctConfig13::::default(); let err = (0..24000) .step_by(1) .map(|i| { diff --git a/crates/geo_filters/src/evaluation/hll.rs b/crates/geo_filters/src/evaluation/hll.rs index 30b74bf..8c506c0 100644 --- a/crates/geo_filters/src/evaluation/hll.rs +++ b/crates/geo_filters/src/evaluation/hll.rs @@ -8,7 +8,10 @@ use std::{ use hyperloglogplus::{HyperLogLog, HyperLogLogPlus}; -use crate::{Count, Distinct}; +use crate::{ + build_hasher::{ReproducibleBuildHasher, UnstableDefaultBuildHasher}, + Count, Distinct, +}; /// Uses at most 192 bytes. /// The relative error has a standard deviation of ~0.065. @@ -77,6 +80,7 @@ pub struct NoopHasher { hash: u64, } +#[derive(Clone, Default)] pub struct BuildNoopHasher {} impl Hasher for NoopHasher { @@ -87,7 +91,9 @@ impl Hasher for NoopHasher { #[inline] fn write(&mut self, _: &[u8]) { - todo!("") + unimplemented!( + "NoopHasher does not support arbitrary byte sequences. Use write_u64 instead" + ); } #[inline] @@ -104,13 +110,20 @@ impl BuildHasher for BuildNoopHasher { } } +impl ReproducibleBuildHasher for BuildNoopHasher {} + impl Count for Hll { fn push_hash(&mut self, hash: u64) { self.inner.borrow_mut().insert(&hash) } + fn push(&mut self, item: I) { + let build_hasher = UnstableDefaultBuildHasher::default(); + self.push_hash(build_hasher.hash_one(item)); + } + fn push_sketch(&mut self, _other: &Self) { - todo!() + unimplemented!() } fn size(&self) -> f32 { @@ -118,7 +131,7 @@ impl Count for Hll { } fn size_with_sketch(&self, _other: &Self) -> f32 { - todo!() + unimplemented!() } fn bytes_in_memory(&self) -> usize { diff --git a/crates/geo_filters/src/evaluation/simulation.rs b/crates/geo_filters/src/evaluation/simulation.rs index ef736fd..eedb7d7 100644 --- a/crates/geo_filters/src/evaluation/simulation.rs +++ b/crates/geo_filters/src/evaluation/simulation.rs @@ -98,7 +98,7 @@ pub fn run_simulations( println!("Parameters:"); println!(); println!(" number of configs = {}", configs.len()); - println!(" number of samples = {}", samples); + println!(" number of samples = {samples}"); println!(" number of sets = {}", set_sizes.len()); println!(); @@ -107,7 +107,7 @@ pub fn run_simulations( let results = configs .iter() .map(|(name, f)| { - print!(" {} ... ", name); + print!(" {name} ... "); std::io::stdout().flush().expect("stdout can be flushed"); let t = Instant::now(); let result = simulate(f, samples, set_sizes); diff --git a/crates/geo_filters/src/lib.rs b/crates/geo_filters/src/lib.rs index 819421b..c810877 100644 --- a/crates/geo_filters/src/lib.rs +++ b/crates/geo_filters/src/lib.rs @@ -7,13 +7,14 @@ //! Supports estimating the size of the union of two sets with a precision related to the estimated size. //! It has some similar properties as related filters like HyperLogLog, MinHash, etc, but uses less space. +pub mod build_hasher; pub mod config; pub mod diff_count; pub mod distinct_count; #[cfg(feature = "evaluation")] pub mod evaluation; -use std::hash::{Hash, Hasher}; +use std::hash::Hash; /// Marker trait to indicate the variant implemented by a [`Count`] instance. pub trait Method: Clone + Eq + PartialEq + Send + Sync {} @@ -33,10 +34,8 @@ pub trait Count { /// Add the given hash to the set. fn push_hash(&mut self, hash: u64); - /// Add the hash of the given item, computed with the default hasher, to the set. - fn push(&mut self, item: I) { - self.push_hash(item_to_hash(item)) - } + /// Add the hash of the given item, computed with the configured hasher, to the set. + fn push(&mut self, item: I); /// Add the given sketch to this one. /// If only the size of the combined set is needed, [`Self::size_with_sketch`] is more efficient and should be used. @@ -53,9 +52,6 @@ pub trait Count { fn bytes_in_memory(&self) -> usize; } -fn item_to_hash(item: I) -> u64 { - // TODO: replace with a stable hashing function! - let mut hasher = std::collections::hash_map::DefaultHasher::new(); - item.hash(&mut hasher); - hasher.finish() -} +#[doc = include_str!("../README.md")] +#[cfg(doctest)] +pub struct ReadmeDocTests; diff --git a/crates/geo_filters/tests/public_api.rs b/crates/geo_filters/tests/public_api.rs index d705d2d..4afff72 100644 --- a/crates/geo_filters/tests/public_api.rs +++ b/crates/geo_filters/tests/public_api.rs @@ -1,3 +1,5 @@ +use geo_filters::build_hasher::UnstableDefaultBuildHasher; + #[test] fn can_use_predefined_diff_count() { use geo_filters::diff_count::GeoDiffCount7; @@ -20,7 +22,7 @@ fn can_use_custom_diff_count() { fn can_use_diff_count_with_predefined_config_value() { use geo_filters::diff_count::{GeoDiffConfig7, GeoDiffCount}; use geo_filters::Count; - let c = GeoDiffConfig7::default(); + let c = GeoDiffConfig7::::default(); let mut f = GeoDiffCount::new(c); f.push(42); f.size(); @@ -31,7 +33,7 @@ fn can_use_diff_count_with_fixed_config_value() { use geo_filters::config::FixedConfig; use geo_filters::diff_count::GeoDiffCount; use geo_filters::Count; - let c = FixedConfig::<_, u16, 7, 128, 10>::default(); + let c = FixedConfig::<_, u16, 7, 128, 10, UnstableDefaultBuildHasher>::default(); let mut f = GeoDiffCount::new(c); f.push(42); f.size(); @@ -42,7 +44,7 @@ fn can_use_diff_count_with_variable_config_value() { use geo_filters::config::VariableConfig; use geo_filters::diff_count::GeoDiffCount; use geo_filters::Count; - let c = VariableConfig::<_, u16>::new(7, 128, 10); + let c = VariableConfig::<_, u16, UnstableDefaultBuildHasher>::new(7, 128, 10); let mut f = GeoDiffCount::new(c); f.push(42); f.size(); @@ -70,7 +72,7 @@ fn can_use_custom_distinct_count() { fn can_use_distinct_count_with_predefined_config_value() { use geo_filters::distinct_count::{GeoDistinctConfig7, GeoDistinctCount}; use geo_filters::Count; - let c = GeoDistinctConfig7::default(); + let c = GeoDistinctConfig7::::default(); let mut f = GeoDistinctCount::new(c); f.push(42); f.size(); @@ -81,7 +83,7 @@ fn can_use_distinct_count_with_fixed_config_value() { use geo_filters::config::FixedConfig; use geo_filters::distinct_count::GeoDistinctCount; use geo_filters::Count; - let c = FixedConfig::<_, u16, 7, 118, 11>::default(); + let c = FixedConfig::<_, u16, 7, 118, 11, UnstableDefaultBuildHasher>::default(); let mut f = GeoDistinctCount::new(c); f.push(42); f.size(); @@ -92,7 +94,7 @@ fn can_use_distinct_count_with_variable_config_value() { use geo_filters::config::VariableConfig; use geo_filters::distinct_count::GeoDistinctCount; use geo_filters::Count; - let c = VariableConfig::<_, u16>::new(7, 118, 11); + let c = VariableConfig::<_, u16, UnstableDefaultBuildHasher>::new(7, 118, 11); let mut f = GeoDistinctCount::new(c); f.push(42); f.size(); diff --git a/crates/string-offsets/src/lib.rs b/crates/string-offsets/src/lib.rs index 02a26aa..35900ad 100644 --- a/crates/string-offsets/src/lib.rs +++ b/crates/string-offsets/src/lib.rs @@ -449,7 +449,7 @@ mod tests { 0 => 0, 1..=3 => 1, 4 => 2, - _ => panic!("invalid utf8 char width: {}", len), + _ => panic!("invalid utf8 char width: {len}"), } }