diff --git a/rust/radix20/src/errors.rs b/rust/radix20/src/errors.rs --- a/rust/radix20/src/errors.rs +++ b/rust/radix20/src/errors.rs @@ -3,11 +3,27 @@ // This software may be used and distributed according to the terms of the // GNU General Public License version 2 or any later version. +use types::KeyId; + error_chain! { + foreign_links { + Io(::std::io::Error); + } + errors { OffsetOverflow(offset: u64) { description("offset overflow") display("offset {} is out of range", offset) } + KeyConflict { + description("same key with different IDs") + } + KeyIncomplete(key_id: KeyId) { + description("incomplete key") + display("key ({:?}) is incomplete", key_id) + } + AmbiguousPrefix { + description("ambiguous prefix") + } } } diff --git a/rust/radix20/src/lib.rs b/rust/radix20/src/lib.rs --- a/rust/radix20/src/lib.rs +++ b/rust/radix20/src/lib.rs @@ -23,3 +23,4 @@ pub mod types; pub mod errors; pub mod base16; +pub mod radix; diff --git a/rust/radix20/src/radix.rs b/rust/radix20/src/radix.rs new file mode 100644 --- /dev/null +++ b/rust/radix20/src/radix.rs @@ -0,0 +1,475 @@ +// Copyright 2017 Facebook, Inc. +// +// This software may be used and distributed according to the terms of the +// GNU General Public License version 2 or any later version. + +//! Main radix buffer implementation that maintains `Key` to `KeyId` mapping. + +use base16::ToBase16Iter; +use errors::{Result, ErrorKind}; +use types::{Key, KeyId, Resize}; + +/// Represent a pointer (offset) to a radix node which contains 16 pointers to other `RadixNode`, +/// `KeyId`, or nothing. +#[derive(Clone, Copy)] +struct RadixOffset(u64); + +impl RadixOffset { + #[inline] + pub fn new(offset: u64) -> Self { RadixOffset(offset) } + + /// Append an empty `RadixNode` (`[u64; 16]`) at the end of a buffer + #[inline] + pub fn create + AsRef<[u64]>>(vec: &mut RB) -> Result { + let mut pos = vec.as_ref().len(); + // The LSB is reserved for marking leafs. Align offsets to multiples of 2. + if pos & 1 == 1 { + pos += 1; + } + if (pos as u64) as usize != pos { + return Err(ErrorKind::OffsetOverflow(pos as u64).into()); + } + vec.resize(pos + 16, 0); + Ok(RadixOffset(pos as u64)) + } + + /// Follow a base16 sequence. Return an optionally matched `KeyId` and the last `RadixNode`, + /// sequence length and the last index. + #[inline] + pub fn follow, KI: Iterator>(self, buf: &RB, seq: KI) + -> Result<(Option, (RadixOffset, usize, u8))> { + let buf = buf.as_ref(); + let mut radix = self; + for (i, b) in seq.enumerate() { + debug_assert!(b < 16); + + let pos = radix.0 + u64::from(b); + if pos as usize >= buf.len() { + return Err(ErrorKind::OffsetOverflow(pos).into()); + } + + let v = buf[pos as usize].to_le(); + if v == 0 { + // Missing + return Ok((None, (radix, i, b))); + } else if v & 1 != 0 { + // KeyId + return Ok((Some(KeyId::from(v >> 1)), (radix, i, b))); + } else { + // RadixOffset + radix = RadixOffset::new(v); + } + } + + // The base16 sequence is too short and does not match a leaf node + Err(ErrorKind::AmbiguousPrefix.into()) + } + + /// Rewrite specified entry to point to another `RadixNode`. + #[inline] + pub fn write_radix>(&self, vec: &mut RB, index: u8, node: RadixOffset) + -> Result<()> { + self.write_raw(vec, index, node.0) + } + + /// Rewrite specified entry to point to a `KeyId`. + #[inline] + pub fn write_key_id>(&self, vec: &mut RB, index: u8, key_id: KeyId) + -> Result<()> { + let id: u64 = key_id.into(); + if id > 0x7fff_ffff_ffff_ffff { + return Err(ErrorKind::OffsetOverflow(id).into()); + } + self.write_raw(vec, index, (id << 1) | 1) + } + + #[inline] + fn write_raw>(&self, vec: &mut RB, index: u8, offset: u64) -> Result<()> { + debug_assert!(index < 16); + let vec = vec.as_mut(); + let pos = self.0 + u64::from(index); + if pos as usize > vec.len() { + return Err(ErrorKind::OffsetOverflow(pos).into()); + } + let v: u64 = offset; + vec[pos as usize] = v.to_le(); + Ok(()) + } +} + +// Public APIs + +/// Lookup a given `Key`. Return an optional potentially matched `KeyId`. +/// The caller is responsible to check whether `KeyId` matches the given `Key` or not. +#[inline] +pub fn radix_lookup_unchecked>(radix_buf: &R, offset: u64, key: &Key) + -> Result> { + let (key_id, _) = RadixOffset::new(offset).follow( + radix_buf, + key.to_base16iter(), + )?; + Ok(key_id) +} + +/// Lookup a given `Key`. Return a verified `KeyId` or `None`. +#[inline] +pub fn radix_lookup(radix_buf: &R, offset: u64, key: &Key, key_buf: &K) + -> Result> +where + R: AsRef<[u64]>, + K: AsRef<[u8]>, +{ + let key_id = radix_lookup_unchecked(radix_buf, offset, key)?; + if let Some(id) = key_id { + if *key_read(key_buf, id)? != *key { + return Ok(None); + } + } + Ok(key_id) +} + +/// Lookup a unique `KeyId` given a prefix of a base16 sequence. +#[inline] +pub fn radix_prefix_lookup(radix_buf: &R, offset: u64, prefix: P, key_buf: &K) + -> Result> +where + R: AsRef<[u64]>, + P: AsRef<[u8]>, + K: AsRef<[u8]>, +{ + let base16seq = prefix.as_ref(); + let root = RadixOffset::new(offset); + let (key_id, _) = root.follow(radix_buf, base16seq.iter().cloned())?; + if let Some(id) = key_id { + let key = key_read(key_buf, id)?; + let matched = key.to_base16iter().zip(base16seq.iter()).all(|(b1, &b2)| { + b1 == b2 + }); + if !matched || key.len() * 2 < base16seq.len() { + return Ok(None); + } + } + Ok(key_id) +} + +/// Insert a `Key` to `KeyId` relationship into the radix tree. +#[inline] +pub fn radix_insert(radix_buf: &mut R, offset: u64, key_id: KeyId, key_buf: &K) -> Result<()> +where + R: Resize + AsRef<[u64]> + AsMut<[u64]>, + K: AsRef<[u8]>, +{ + let new_key = key_read(key_buf, key_id)?; + radix_insert_with_key(radix_buf, offset, key_id, new_key, key_buf) +} + +#[inline] +pub fn radix_insert_with_key(radix_buf: &mut R, off: u64, id: KeyId, key: &Key, key_buf: &K) + -> Result<()> +where + R: Resize + AsRef<[u64]> + AsMut<[u64]>, + K: AsRef<[u8]>, +{ + let new_key = key; + let new_key_id = id; + let root = RadixOffset::new(off); + let (old_key_id, (radix, i, b)) = root.follow(radix_buf, key.to_base16iter())?; + match old_key_id { + Some(old_key_id) => { + // No need to re-insert a same key + if old_key_id == new_key_id { + return Ok(()); + } + + // Need to do a leaf split + let old_key = key_read(key_buf, old_key_id)?; + + // Find common prefix starting from the next base16 integer + let mut common_len = 0; + let old_iter = old_key.to_base16iter().skip(i + 1); + let new_iter = new_key.to_base16iter().skip(i + 1); + for (b1, b2) in old_iter.zip(new_iter) { + if b1 == b2 { + common_len += 1; + } else { + // Got a chain of radix nodes to write back + let mut node = RadixOffset::create(radix_buf)?; + node.write_key_id(radix_buf, b1, old_key_id)?; + node.write_key_id(radix_buf, b2, new_key_id)?; + for k in new_iter.take(common_len).rev() { + let new_node = RadixOffset::create(radix_buf)?; + new_node.write_radix(radix_buf, k, node)?; + node = new_node; + } + return radix.write_radix(radix_buf, b, node); + } + } + + // old_key is key, is a prefix of new_key, or vice-versa + Err(ErrorKind::KeyConflict.into()) + } + None => radix.write_key_id(radix_buf, b, new_key_id), + } +} + +/// Read `Key` at specified offset. +#[inline] +pub fn key_read>(key_buf: &K, key_id: KeyId) -> Result<&Key> { + let key_buf = key_buf.as_ref(); + let start_pos: u64 = key_id.into(); + let end_pos = start_pos + Key::fixed_len() as u64; + if key_buf.len() < end_pos as usize { + Err(ErrorKind::KeyIncomplete(key_id).into()) + } else { + let offset: u64 = key_id.into(); + unsafe { + let p = key_buf.as_ptr().offset(offset as isize) as *const Key; + Ok(&*p) + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::collections::HashSet; + use rand::{ChaChaRng, Rng}; + use test::Bencher; + use quickcheck::{Arbitrary, Gen}; + + #[test] + fn test_errors() { + let key_buf = vec![0u8]; + let mut radix_buf = vec![0u64; 15]; + let key = Key::default(); + let r = radix_insert_with_key(&mut radix_buf, 0, (1 << 63).into(), &key, &key_buf); + assert_eq!(r.unwrap_err().description(), "offset overflow"); + let r = radix_insert(&mut radix_buf, 0, 1.into(), &key_buf); + assert_eq!(r.unwrap_err().description(), "incomplete key"); + let r = radix_insert(&mut radix_buf, 0, 30.into(), &key_buf); + assert_eq!( + format!("{}", r.unwrap_err()), + "key (KeyId(30)) is incomplete" + ); + let r = radix_insert_with_key(&mut radix_buf, 16, 1.into(), &key, &key_buf); + assert_eq!(format!("{}", r.unwrap_err()), "offset 16 is out of range"); + let r = radix_prefix_lookup(&radix_buf, 0, [0xf, 0xfu8], &key_buf); + assert_eq!(format!("{}", r.unwrap_err()), "offset 15 is out of range"); + } + + #[test] + fn test_prefix_lookup() { + let mut key_buf = vec![0u8]; + let mut radix_buf = vec![0u64; 16]; + let key1 = b"0123456789abcdefghij".into(); + let key1_id = key_append(&mut key_buf, &key1); + radix_insert(&mut radix_buf, 0, key1_id, &key_buf).expect("insert"); + let base16: Vec = key1.to_base16iter() + .chain([0u8; 10].iter().cloned()) + .collect(); + for i in 1..base16.len() { + let r = radix_prefix_lookup(&radix_buf, 0, &base16[0..i], &key_buf); + if i <= 40 { + assert_eq!(r.unwrap(), Some(key1_id)); + } else { + assert_eq!(r.unwrap(), None); + } + } + + let key2 = b"0123456789bbcdefghij".into(); + let key2_id = key_append(&mut key_buf, &key2); + radix_insert(&mut radix_buf, 0, key2_id, &key_buf).expect("insert"); + for i in 1..base16.len() { + let r = radix_prefix_lookup(&radix_buf, 0, &base16[0..i], &key_buf); + if i <= 21 { + assert_eq!(r.unwrap_err().description(), "ambiguous prefix") + } else if i <= 40 { + assert_eq!(r.unwrap(), Some(key1_id)); + } else { + assert_eq!(r.unwrap(), None); + } + } + } + + #[test] + fn test_compare_with_stdset_manual() { + // Manual crafted case which causes leaf splits + let keys = [ + "abc", + "a", + "", + "abcde", + "abcdg", + "abcd", + "abcxy", + "abcyz", + "b", + "xyz", + ]; + let set: HashSet = keys.iter() + .map(|s| { + let mut buf = [0u8; 20]; + let s = s.as_bytes(); + buf[0..s.len()].copy_from_slice(s); + Key::from(buf) + }) + .collect(); + assert!(check_with_stdset(set)); + } + + #[test] + fn test_multiple_roots() { + // Single radix buffer containing multiple roots + let count = 1000; + let key_buf = randomized_key_buf(count); + let mut radix_buf = vec![0u64; 61]; + for i in 0..count { + let key_id: KeyId = ((i * Key::fixed_len()) as u64).into(); + let mut len_deltas = HashSet::::new(); + for j in 0..3 { + let orig_len = radix_buf.len() + (radix_buf.len() & 1); + radix_insert(&mut radix_buf, 1 + j * 17, key_id, &key_buf).expect("insert"); + len_deltas.insert(1 + radix_buf.len() - orig_len); + } + // Added size to the radix buffer should be the same regardless of the root offset + assert_eq!(len_deltas.len(), 1); + } + for i in 0..count { + let key_id: KeyId = ((i * Key::fixed_len()) as u64).into(); + let key = key_read(&key_buf, key_id).unwrap(); + for j in 0..3 { + assert_eq!( + radix_lookup(&radix_buf, 1 + j * 17, &key, &key_buf) + .unwrap() + .unwrap(), + key_id + ); + } + } + } + + quickcheck! { + fn test_compare_with_stdset_sparse(std_set: HashSet) -> bool { + check_with_stdset(std_set) + } + + fn test_compare_with_stdset_dense(std_set: HashSet) -> bool { + let std_set: HashSet = std_set.iter().map(|x| { + let mut buf = [0u8; 20]; + buf[0] = (x >> 8) as u8; + buf[1] = (x & 0xff) as u8; + Key::from(buf) + }).collect(); + check_with_stdset(std_set) + } + + fn test_duplicated_key_ids(key: Key) -> bool { + check_duplicated_key_ids(key) + } + } + + // Compare with `HashSet`. + fn check_with_stdset(std_set: HashSet) -> bool { + let mut key_buf = vec![0u8]; + let mut radix_buf = vec![0u64; 16]; + + // Insert to radix tree + for key in &std_set { + if let Ok(Some(_)) = radix_lookup(&radix_buf, 0, key, &key_buf) { + return false; + } + let key_id = key_append(&mut key_buf, key); + radix_insert(&mut radix_buf, 0, key_id, &key_buf).expect("insert"); + radix_insert_with_key(&mut radix_buf, 0, key_id, key, &key_buf).expect("insert"); + } + + // Test key existence + for &key in &std_set { + for bit in 0..161 { + let mut new_key = key.clone(); + if bit > 0 { + let mut buf: [u8; 20] = *key; + buf[(bit - 1) / 8] ^= 1u8 << ((bit - 1) % 8); + new_key = buf.into(); + } + let std_exist = std_set.contains(&new_key); + let radix_exist = radix_lookup(&radix_buf, 0, &new_key, &key_buf) + .unwrap() + .is_some(); + if std_exist != radix_exist { + return false; + } + } + } + true + } + + // Duplicated `KeyId`s triggers the `KeyConflict` error. + fn check_duplicated_key_ids(key: Key) -> bool { + let mut key_buf = vec![0u8]; + let mut radix_buf = vec![0u64; 16]; + let key_id1 = key_append(&mut key_buf, &key); + let key_id2 = key_append(&mut key_buf, &key); + radix_insert(&mut radix_buf, 0, key_id1, &key_buf).expect("insert"); + radix_insert(&mut radix_buf, 0, key_id2, &key_buf) + .unwrap_err() + .description() == "same key with different IDs" + } + + const COUNT: usize = 51200; + + #[bench] + fn bench_insert(b: &mut Bencher) { + let key_buf = randomized_key_buf(COUNT); + b.iter(|| { batch_insert_radix_buf(&key_buf, COUNT); }) + } + + #[bench] + fn bench_unchecked_lookups(b: &mut Bencher) { + let key_buf = randomized_key_buf(COUNT); + let radix_buf = batch_insert_radix_buf(&key_buf, COUNT); + b.iter(|| for i in 0..COUNT { + let key_id = (i as u64 * 20).into(); + let key = key_read(&key_buf, key_id).unwrap(); + radix_lookup_unchecked(&radix_buf, 0, &key).expect("lookup"); + }) + } + + #[bench] + fn bench_lookups(b: &mut Bencher) { + let key_buf = randomized_key_buf(COUNT); + let radix_buf = batch_insert_radix_buf(&key_buf, COUNT); + b.iter(|| for i in 0..COUNT { + let key_id = (i as u64 * 20).into(); + let key = key_read(&key_buf, key_id).unwrap(); + radix_lookup(&radix_buf, 0, &key, &key_buf).expect("lookup"); + }) + } + + // Append `Key` to `key_buf`. Assign a new `KeyId` to it. + fn key_append(key_buf: &mut Vec, key: &Key) -> KeyId { + let pos = key_buf.len(); + key_buf.extend_from_slice(&key[..]); + KeyId::from(pos as u64) + } + + fn randomized_key_buf(count: usize) -> Vec { + let mut key_buf = vec![0u8; count * Key::fixed_len()]; + ChaChaRng::new_unseeded().fill_bytes(key_buf.as_mut()); + key_buf + } + + fn batch_insert_radix_buf(key_buf: &Vec, count: usize) -> Vec { + let mut radix_buf = vec![0u64; 16]; + for i in 0..count { + let key_id: KeyId = ((i * Key::fixed_len()) as u64).into(); + radix_insert(&mut radix_buf, 0, key_id, key_buf).expect("insert"); + } + radix_buf + } + + impl Arbitrary for Key { + fn arbitrary(gen: &mut G) -> Self { gen.gen() } + fn shrink(&self) -> Box> { Box::new(vec![].into_iter()) } + } +}