diff --git a/rust/radix204/src/errors.rs b/rust/radix204/src/errors.rs
--- a/rust/radix204/src/errors.rs
+++ b/rust/radix204/src/errors.rs
@@ -6,4 +6,21 @@ use types::KeyId;
 
 error_chain! {
+    foreign_links {
+        BorrowMut(::std::cell::BorrowMutError);
+    }
+
+    errors {
+        OffsetOverflow(offset: usize) {
+            description("offset overflow")
+            display("offset {} is out of range", offset)
+        }
+        KeyConflict {
+            description("same key with different IDs")
+        }
+        KeyIncomplete(key_id: KeyId) {
+            description("incomplete key")
+            display("key (id={}) is incomplete", key_id)
+        }
+    }
 }
 
diff --git a/rust/radix204/src/lib.rs b/rust/radix204/src/lib.rs
--- a/rust/radix204/src/lib.rs
+++ b/rust/radix204/src/lib.rs
@@ -20,3 +20,9 @@ mod constants;
 pub mod errors;
 pub mod base16;
+pub mod radix;
+
+pub type RadixBuffer = radix::RadixBuffer;
+
+#[cfg(test)]
+mod testutil;
 
diff --git a/rust/radix204/src/radix.rs b/rust/radix204/src/radix.rs
new file mode 100644
--- /dev/null
+++ b/rust/radix204/src/radix.rs
@@ -0,0 +1,353 @@
+// Copyright 2017 Facebook, Inc.
+//
+// This software may be used and distributed according to the terms of the
+// GNU General Public License version 2 or any later version.
+
+//! Main radix buffer implementation that maintains the `Key` to `KeyId` mapping.
+
+use std::ops::DerefMut;
+
+use base16::ToBase16Iter;
+use constants::KEY_ID_MAX;
+use errors::{ErrorKind, Result};
+use types::{Key, KeyId, Offset, Shared};
+
+// Concrete types backing the key buffer (KB) and the radix buffer (RB).
+// They might be changed to type parameters, e.g. they might be backed by mmap.
+type KB = Vec<u8>;
+type RB = Vec<u32>;
+
+/// A pointer to a group of 16 offsets that are rewritable in-place.
+struct RadixNode(Offset);
+
+/// The return value of `RadixNode::follow`.
+enum FollowResult {
+    Radix(RadixNode),
+    KeyId(KeyId),
+    Missing,
+}
+
+impl RadixNode {
+    #[inline]
+    pub fn new(offset: Offset) -> Self {
+        RadixNode(offset)
+    }
+
+    #[inline]
+    pub fn create(vec: &mut RB) -> Result<RadixNode> {
+        let mut pos = vec.len();
+        // The LSB of an entry is reserved for marking a leaf (`KeyId`), so
+        // node offsets must be even. Align pos by 2.
+        if pos & 1 == 1 {
+            pos += 1;
+        }
+        vec.resize(pos + 16, 0);
+        Ok(RadixNode(pos as Offset))
+    }
+
+    /// Follow the tree down by one step, taking the child at `index`.
+    #[inline]
+    pub fn follow(&self, vec: &RB, index: u8) -> Result<FollowResult> {
+        debug_assert!(index < 16);
+        let pos = (self.0 as usize) + usize::from(index);
+        match vec.get(pos) {
+            None => Err(ErrorKind::OffsetOverflow(pos).into()),
+            Some(v) => {
+                let v = v.to_le();
+                if v == 0 {
+                    Ok(FollowResult::Missing)
+                } else if v & 1 != 0 {
+                    Ok(FollowResult::KeyId(v >> 1))
+                } else {
+                    Ok(FollowResult::Radix(RadixNode::new(v)))
+                }
+            }
+        }
+    }
+
+    /// Rewrite an index entry to point to another `RadixNode`.
+    #[inline]
+    pub fn write_radix(&self, vec: &mut RB, index: u8, node: &RadixNode) -> Result<()> {
+        self.write_raw_offset(vec, index, node.0)
+    }
+
+    /// Rewrite an index entry to point to a `KeyId`.
+    #[inline]
+    pub fn write_key_id(&self, vec: &mut RB, index: u8, key_id: KeyId) -> Result<()> {
+        if key_id > KEY_ID_MAX {
+            return Err(ErrorKind::OffsetOverflow(key_id as usize).into());
+        }
+        self.write_raw_offset(vec, index, (key_id << 1) | 1)
+    }
+
+    #[inline]
+    fn write_raw_offset(&self, vec: &mut RB, index: u8, offset: Offset) -> Result<()> {
+        debug_assert!(index < 16);
+        let pos = (self.0 as usize) + usize::from(index);
+        if pos >= vec.len() {
+            return Err(ErrorKind::OffsetOverflow(pos).into());
+        }
+        vec[pos] = offset.to_le();
+        Ok(())
+    }
+}
+
+/// `RadixBuffer` provides efficient `Key` to `KeyId` lookups.
+/// It maintains two buffers:
+/// - A key buffer for `Key` lookups. The caller is responsible for appending
+///   new keys there.
+/// - A radix buffer for storing the radix tree offsets.
+pub struct RadixBuffer(Shared<KB>, Shared<RB>, Offset);
+
+impl RadixBuffer {
+    pub fn new(key_buf: Shared<KB>, radix_buf: Shared<RB>, root: Offset) -> Self {
+        RadixBuffer(key_buf, radix_buf, root)
+    }
+
+    /// Look up a `Key`, verifying that the resulting `KeyId` maps back to it.
+    pub fn lookup(&self, key: &Key) -> Result<Option<KeyId>> {
+        let result = self.lookup_unchecked(key);
+        if let Ok(Some(key_id)) = result {
+            // Double-check that key_id refers to the requested key.
+            let key_to_check = self.read_key(key_id)?;
+            if key_to_check != *key {
+                return Ok(None);
+            }
+        }
+        result
+    }
+
+    /// Find the `KeyId` of a `Key`. Does not check that the `KeyId` and the
+    /// `Key` match.
+    pub fn lookup_unchecked(&self, key: &Key) -> Result<Option<KeyId>> {
+        borrow_shared(&self.1, |rbuf| {
+            let mut radix = RadixNode::new(self.2);
+            for b in key.to_base16iter() {
+                match radix.follow(rbuf, b)? {
+                    FollowResult::Radix(next_radix) => {
+                        radix = next_radix;
+                    }
+                    FollowResult::Missing => {
+                        return Ok(None);
+                    }
+                    FollowResult::KeyId(id) => return Ok(Some(id)),
+                }
+            }
+            Err(ErrorKind::KeyConflict.into())
+        })
+    }
+
+    /// Insert when only the `KeyId` is known.
+    pub fn insert(&self, new_key_id: KeyId) -> Result<()> {
+        let new_key = self.read_key(new_key_id)?;
+        self.insert_with_key(new_key_id, &new_key)
+    }
+
+    /// Insert when both the `KeyId` and the `Key` are known.
+    pub fn insert_with_key(&self, new_key_id: KeyId, new_key: &Key) -> Result<()> {
+        borrow_shared(&self.1, |rbuf| {
+            let mut radix = RadixNode::new(self.2);
+            for (i, b) in new_key.to_base16iter().enumerate() {
+                match radix.follow(rbuf, b)? {
+                    FollowResult::Radix(next_radix) => {
+                        radix = next_radix;
+                    }
+                    FollowResult::Missing => {
+                        return radix.write_key_id(rbuf, b, new_key_id);
+                    }
+                    FollowResult::KeyId(old_key_id) => {
+                        // No need to re-insert the same key.
+                        if old_key_id == new_key_id {
+                            return Ok(());
+                        }
+
+                        // Need to do a leaf split.
+                        let old_key = self.read_key(old_key_id)?;
+
+                        // Find the common prefix, starting from the next
+                        // base16 integer.
+                        let mut common = Vec::with_capacity(4);
+                        for (b1, b2) in old_key
+                            .to_base16iter()
+                            .zip(new_key.to_base16iter())
+                            .skip(i + 1)
+                        {
+                            if b1 == b2 {
+                                // The keys have not diverged yet. Remember the
+                                // shared base16 digit.
+                                common.push(b1);
+                            } else {
+                                // Got a chain of radix nodes to write back,
+                                // built leaf-first.
+                                let mut node = RadixNode::create(rbuf)?;
+                                node.write_key_id(rbuf, b1, old_key_id)?;
+                                node.write_key_id(rbuf, b2, new_key_id)?;
+                                for &k in common.iter().rev() {
+                                    let new_node = RadixNode::create(rbuf)?;
+                                    new_node.write_radix(rbuf, k, &node)?;
+                                    node = new_node;
+                                }
+                                return radix.write_radix(rbuf, b, &node);
+                            }
+                        }
+
+                        // old_key equals new_key (with a different id), or one
+                        // key is a prefix of the other.
+                        return Err(ErrorKind::KeyConflict.into());
+                    }
+                }
+            }
+            Err(ErrorKind::KeyConflict.into())
+        })
+    }
+
+    #[inline]
+    fn read_key(&self, key_id: KeyId) -> Result<Key> {
+        borrow_shared(&self.0, |kvec| {
+            let start_pos = key_id as usize;
+            let end_pos = start_pos + 20;
+            if kvec.len() < end_pos {
+                Err(ErrorKind::KeyIncomplete(key_id).into())
+            } else {
+                let mut key: Key = Key::default();
+                key.copy_from_slice(&kvec[start_pos..end_pos]);
+                Ok(key)
+            }
+        })
+    }
+}
+
+/// Borrow `&mut B` from a `Shared<B>` within the scope of `func`.
+#[inline]
+pub fn borrow_shared<B, F, R>(shared: &Shared<B>, func: F) -> Result<R>
+where
+    F: FnOnce(&mut B) -> Result<R>,
+{
+    let mut borrowed = shared.try_borrow_mut()?;
+    func(borrowed.deref_mut())
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use testutil::*;
+    use constants::KEY_LEN;
+    use std::collections::HashSet;
+    use std::rc::Rc;
+    use std::cell::RefCell;
+    use rand::{thread_rng, Rng};
+    use test::Bencher;
+
+    type KB = Vec<u8>;
+    type RB = Vec<u32>;
+
+    fn prepare_kbuf_radix() -> (Shared<KB>, Shared<RB>, RadixBuffer) {
+        let kbuf = Rc::new(RefCell::new(vec![]));
+        let rbuf = Rc::new(RefCell::new(vec![0u32; 17]));
+        // A stored offset of 0 means "missing", so the root node lives at 1.
+        let radix = RadixBuffer::new(Rc::clone(&kbuf), Rc::clone(&rbuf), 1);
+        (kbuf, rbuf, radix)
+    }
+
+    fn append_key(kbuf: &Shared<KB>, key: &Key) -> KeyId {
+        borrow_shared(kbuf, |kvec| {
+            let pos = kvec.len();
+            kvec.extend_from_slice(key);
+            Ok(pos as Offset)
+        }).unwrap()
+    }
+
+    fn check_duplicated_key_ids(v: (u64, u64, u32)) -> bool {
+        let (kbuf, _rbuf, radix) = prepare_kbuf_radix();
+        let key = key_from_ints(v);
+        let key_id1 = append_key(&kbuf, &key);
+        let key_id2 = append_key(&kbuf, &key);
+        radix.insert(key_id1).expect("insert");
+        radix.insert(key_id2).unwrap_err().description() == "same key with different IDs"
+    }
+
+    fn check_with_stdset(std_set: HashSet<(u64, u64, u32)>) -> bool {
+        let (kbuf, _rbuf, radix) = prepare_kbuf_radix();
+        for &v in &std_set {
+            let key = key_from_ints(v);
+            if radix.lookup(&key).unwrap().is_some() {
+                return false;
+            }
+            let key_id = append_key(&kbuf, &key);
+            radix.insert(key_id).expect("insert");
+            radix.insert_with_key(key_id, &key).expect("insert");
+        }
+        for &v in &std_set {
+            // Check the inserted key, plus nearby keys differing by a few bits.
+            for &a in [
+                (v.0, v.1, v.2),
+                (v.0, v.1, v.2 ^ 1),
+                (v.0, v.1, v.2 ^ 3),
+                (v.0, v.1 ^ 1, v.2),
+                (v.0 ^ (1 << 62), v.1, v.2),
+            ].iter()
+            {
+                let key = key_from_ints(a);
+                if std_set.contains(&a) ^ radix.lookup(&key).unwrap().is_some() {
+                    return false;
+                }
+            }
+        }
+        true
+    }
+
+    #[test]
+    fn test_errors() {
+        let (_kbuf, _rbuf, radix) = prepare_kbuf_radix();
+        let key = key_from_ints((1, 2, 3));
+        let r = radix.insert_with_key(1 << 31, &key);
+        assert_eq!(r.unwrap_err().description(), "offset overflow");
+        assert_eq!(radix.insert(1).unwrap_err().description(), "incomplete key");
+    }
+
+    #[bench]
+    fn bench_10k_insert(b: &mut Bencher) {
+        const COUNT: KeyId = 10240;
+        let key_buf: Vec<u8> = thread_rng()
+            .gen_iter()
+            .take(COUNT as usize * KEY_LEN)
+            .collect();
+        let kbuf = Rc::new(RefCell::new(key_buf));
+        b.iter(|| {
+            let rbuf = Rc::new(RefCell::new(vec![0u32; 16]));
+            let radix = RadixBuffer::new(Rc::clone(&kbuf), Rc::clone(&rbuf), 0);
+            for key_id in 0..COUNT {
+                radix.insert(key_id * (KEY_LEN as Offset)).expect("insert");
+            }
+        })
+    }
+
+    fn prepare_radix_keys(count: usize) -> (RadixBuffer, Vec<Key>) {
+        let mut rng = thread_rng();
+        let (kbuf, _rbuf, radix) = prepare_kbuf_radix();
+        let mut keys = vec![];
+        for _ in 0..count {
+            let key = key_from_ints((rng.next_u64(), rng.next_u64(), rng.next_u32()));
+            let key_id = append_key(&kbuf, &key);
+            radix.insert_with_key(key_id, &key).expect("insert");
+            keys.push(key);
+        }
+        (radix, keys)
+    }
+
+    #[bench]
+    fn bench_10k_fast_lookups(b: &mut Bencher) {
+        let (radix, keys) = prepare_radix_keys(10240);
+        b.iter(|| for key in &keys {
+            radix.lookup_unchecked(key).expect("lookup");
+        })
+    }
+
+    #[bench]
+    fn bench_10k_slow_lookups(b: &mut Bencher) {
+        let (radix, keys) = prepare_radix_keys(10240);
+        b.iter(|| for key in &keys {
+            radix.lookup(key).expect("lookup");
+        })
+    }
+
+    quickcheck! {
+        fn test_compare_with_stdset(std_set: HashSet<(u64, u64, u32)>) -> bool {
+            check_with_stdset(std_set)
+        }
+
+        fn test_duplicated_key_ids(v: (u64, u64, u32)) -> bool {
+            check_duplicated_key_ids(v)
+        }
+    }
+}
diff --git a/rust/radix204/src/testutil.rs b/rust/radix204/src/testutil.rs
new file mode 100644
--- /dev/null
+++ b/rust/radix204/src/testutil.rs
@@ -0,0 +1,22 @@
+// Copyright 2017 Facebook, Inc.
+//
+// This software may be used and distributed according to the terms of the
+// GNU General Public License version 2 or any later version.
+
+use types::Key;
+use std::mem::transmute;
+use std::ptr::copy_nonoverlapping as copy;
+
+// quickcheck::Arbitrary is not implemented for Key and it cannot
+// be implemented here. So just convert 3 integers to a Key.
+// "pub" to make it available for other tests.
+pub fn key_from_ints(v: (u64, u64, u32)) -> [u8; 20] {
+    let mut buf: Key = [0u8; 20];
+    let p = (&mut buf[0]) as *mut u8;
+    unsafe {
+        copy(transmute::<u64, [u8; 8]>(v.0.to_be()).as_ptr(), p, 8);
+        copy(transmute::<u64, [u8; 8]>(v.1.to_be()).as_ptr(), p.offset(8), 8);
+        copy(transmute::<u32, [u8; 4]>(v.2.to_be()).as_ptr(), p.offset(16), 4);
+    }
+    buf
+}
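
Usage sketch (commentary, not part of the patch): the test helpers above imply that `Shared<T>` is `Rc<RefCell<T>>`, that `Key` is `[u8; 20]`, and that `Offset` and `KeyId` are `u32`. Assuming that, and assuming the crate is linked as `radix204`, a caller wires the two buffers together roughly as below. A stored offset of 0 doubles as the "missing" marker, so the root node is placed at offset 1 with 16 zeroed entries, mirroring `prepare_kbuf_radix`:

// Illustrative only. `Shared<T> = Rc<RefCell<T>>`, `Key = [u8; 20]` and
// `Offset = KeyId = u32` are assumptions inferred from the tests above.
extern crate radix204;

use std::cell::RefCell;
use std::rc::Rc;

use radix204::radix::{borrow_shared, RadixBuffer};

fn main() {
    // Root node at offset 1; slot 0 is never pointed at, since a stored
    // offset of 0 means "missing".
    let kbuf = Rc::new(RefCell::new(Vec::<u8>::new()));
    let rbuf = Rc::new(RefCell::new(vec![0u32; 17]));
    let radix = RadixBuffer::new(Rc::clone(&kbuf), Rc::clone(&rbuf), 1);

    // The caller appends the key bytes itself; the key's byte offset in the
    // key buffer serves as its KeyId.
    let key = [0x42u8; 20];
    let key_id = borrow_shared(&kbuf, |kvec| {
        let pos = kvec.len() as u32;
        kvec.extend_from_slice(&key);
        Ok(pos)
    }).unwrap();

    radix.insert_with_key(key_id, &key).unwrap();
    assert_eq!(radix.lookup(&key).unwrap(), Some(key_id));
    assert_eq!(radix.lookup(&[0u8; 20]).unwrap(), None);
}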