diff --git a/rust/radix204/src/errors.rs b/rust/radix204/src/errors.rs --- a/rust/radix204/src/errors.rs +++ b/rust/radix204/src/errors.rs @@ -6,10 +6,21 @@ use types::KeyId; error_chain! { + foreign_links { + BorrowMut(::std::cell::BorrowMutError); + } + errors { OffsetOverflow(offset: usize) { description("offset overflow") display("offset {} is out of range", offset) } + KeyConflict { + description("same key with different IDs") + } + KeyIncomplete(key_id: KeyId) { + description("incomplete key") + display("key (id={}) is incomplete", key_id.to_usize()) + } } } diff --git a/rust/radix204/src/lib.rs b/rust/radix204/src/lib.rs --- a/rust/radix204/src/lib.rs +++ b/rust/radix204/src/lib.rs @@ -23,3 +23,6 @@ pub mod types; pub mod errors; pub mod base16; +pub mod radix; + +pub type RadixBuffer = radix::RadixBuffer; diff --git a/rust/radix204/src/radix.rs b/rust/radix204/src/radix.rs new file mode 100644 --- /dev/null +++ b/rust/radix204/src/radix.rs @@ -0,0 +1,357 @@ +// Copyright 2017 Facebook, Inc. +// +// This software may be used and distributed according to the terms of the +// GNU General Public License version 2 or any later version. + +//! Main radix buffer implementation that maintains `Key` to `KeyId` mapping. + +use std::ops::DerefMut; + +use base16::ToBase16Iter; +use errors::{Result, ErrorKind}; +use types::{Key, KeyId, Offset, Shared}; + +// KeyBuffer and RadixBuffer. +// They might be changed to type parameters. ex. they might be backed by mmap. +type KB = Vec<u8>; +type RB = Vec<u32>; + +/// Represents pointer to 16 offsets that are rewritable in-place. +struct RadixNode(Offset); + +/// The return value of `RadixNode::follow`. +enum FollowResult { + Radix(RadixNode), + KeyId(KeyId), + Missing, +} + +impl RadixNode { + #[inline] + pub fn new(offset: Offset) -> Self { RadixNode(offset) } + + #[inline] + pub fn create(vec: &mut RB) -> Result<RadixNode> { + let mut pos = vec.len(); + // The LSB is reserved for marking leaf. So align pos by 2. 
+ if pos & 1 == 1 { + pos += 1; + } + if pos > ::std::u32::MAX as usize { + return Err(ErrorKind::OffsetOverflow(pos).into()); + } + vec.resize(pos + 16, 0); + Ok(RadixNode(Offset::from(pos as u32))) + } + + /// Follow index by one step. + #[inline] + pub fn follow(&self, vec: &RB, index: u8) -> Result<FollowResult> { + debug_assert!(index < 16); + let pos = self.0.to_usize() + usize::from(index); + let o = vec.get(pos); + match o { + None => Err(ErrorKind::OffsetOverflow(pos).into()), + Some(v) => { + let v = v.to_le(); + if v == 0 { + Ok(FollowResult::Missing) + } else if v & 1 != 0 { + Ok(FollowResult::KeyId((v >> 1).into())) + } else { + Ok(FollowResult::Radix(RadixNode::new(v.into()))) + } + } + } + } + + /// Rewrite an index entry to point to another `RadixNode`. + #[inline] + pub fn write_radix(&self, vec: &mut RB, index: u8, node: &RadixNode) -> Result<()> { + self.write_raw_offset(vec, index, node.0) + } + + /// Rewrite an index entry to point to a `KeyId`. + #[inline] + pub fn write_key_id(&self, vec: &mut RB, index: u8, key_id: KeyId) -> Result<()> { + self.write_raw_offset(vec, index, key_id.to_encoded_offset()?) + } + + #[inline] + fn write_raw_offset(&self, vec: &mut RB, index: u8, offset: Offset) -> Result<()> { + debug_assert!(index < 16); + let pos = self.0.to_usize() + usize::from(index); + if pos > vec.len() { + return Err(ErrorKind::OffsetOverflow(pos).into()); + } + let v: u32 = offset.into(); + vec[pos] = v.to_le(); + Ok(()) + } +} + +/// `RadixBuffer` provides efficient `Key` to `KeyId` lookups. It maintains two buffers: +/// - A key buffer for `Key` lookups. Caller is responsible for appending new keys there. +/// - A radix buffer for storing radix tree offsets. 
+pub struct RadixBuffer(Shared<KB>, Shared<RB>, Offset); + +impl RadixBuffer { + pub fn new(key_buf: Shared<KB>, radix_buf: Shared<RB>, root: Offset) -> Self { + RadixBuffer(key_buf, radix_buf, root) + } + + /// Lookup a `Key` + pub fn lookup(&self, key: &Key) -> Result<Option<KeyId>> { + let result = self.lookup_unchecked(key); + if let Ok(Some(key_id)) = result { + // Double check if key_id matches + let key_to_check = self.read_key(key_id)?; + if key_to_check != *key { + return Ok(None); + } + } + result + } + + /// Find `KeyId` from a `Key`. Do not check if `KeyId` and `Key` matches. + pub fn lookup_unchecked(&self, key: &Key) -> Result<Option<KeyId>> { + borrow_shared(&self.1, |rbuf| { + let mut radix = RadixNode::new(self.2); + for b in key.to_base16iter() { + match radix.follow(rbuf, b)? { + FollowResult::Radix(next_radix) => { + radix = next_radix; + } + FollowResult::Missing => { + return Ok(None); + } + FollowResult::KeyId(id) => return Ok(Some(id)), + } + } + Err(ErrorKind::KeyConflict.into()) + }) + } + + /// Insert when `KeyId` is known + pub fn insert(&self, new_key_id: KeyId) -> Result<()> { + let new_key = self.read_key(new_key_id)?; + self.insert_with_key(new_key_id, &new_key) + } + + /// Insert when both `KeyId` and `Key` are known + pub fn insert_with_key(&self, new_key_id: KeyId, new_key: &Key) -> Result<()> { + borrow_shared(&self.1, |rbuf| { + let mut radix = RadixNode::new(self.2); + for (i, b) in new_key.to_base16iter().enumerate() { + match radix.follow(rbuf, b)? 
{ + FollowResult::Radix(next_radix) => { + radix = next_radix; + } + FollowResult::Missing => { + return radix.write_key_id(rbuf, b, new_key_id); + } + FollowResult::KeyId(old_key_id) => { + // No need to re-insert a same key + if old_key_id == new_key_id { + return Ok(()); + } + + // Need to do a leaf split + let old_key = self.read_key(old_key_id)?; + + // Find common prefix starting from the next base16 integer + let mut common = Vec::with_capacity(4); + + for (b1, b2) in old_key + .to_base16iter() + .zip(new_key.to_base16iter()) + .skip(i + 1) + { + if b1 == b2 { + // Not known what to do yet + common.push(b1); + } else { + // Got a chain of radix nodes to write back + let mut node = RadixNode::create(rbuf)?; + node.write_key_id(rbuf, b1, old_key_id)?; + node.write_key_id(rbuf, b2, new_key_id)?; + for &k in common.iter().rev() { + let new_node = RadixNode::create(rbuf)?; + new_node.write_radix(rbuf, k, &node)?; + node = new_node; + } + return radix.write_radix(rbuf, b, &node); + } + } + + // old_key is key, is a prefix of new_key, or vice-versa + return Err(ErrorKind::KeyConflict.into()); + } + } + } + Err(ErrorKind::KeyConflict.into()) + }) + } + + #[inline] + fn read_key(&self, key_id: KeyId) -> Result<Key> { + borrow_shared(&self.0, |kvec| { + let start_pos = key_id.to_usize(); + let end_pos = start_pos + 20; + if kvec.len() < end_pos { + Err(ErrorKind::KeyIncomplete(key_id).into()) + } else { + Ok(kvec[start_pos..end_pos].into()) + } + }) + } +} + +/// Borrow &mut B from Shared within func scope +#[inline] +pub fn borrow_shared<B, F, R>(shared: &Shared<B>, func: F) -> Result<R> +where + F: FnOnce(&mut B) -> Result<R>, +{ + let mut borrowed = shared.try_borrow_mut()?; + func(borrowed.deref_mut()) +} + +#[cfg(test)] +mod tests { + use super::*; + use std::collections::HashSet; + use std::ops::Deref; + use std::rc::Rc; + use std::cell::RefCell; + use rand::{ChaChaRng, Rng}; + use test::Bencher; + use quickcheck::{Arbitrary, Gen}; + + type KB = Vec<u8>; + type RB = Vec<u32>; + + fn 
prepare_kbuf_radix() -> (Shared<KB>, Shared<RB>, RadixBuffer) { + let kbuf = Rc::new(RefCell::new(vec![])); + let rbuf = Rc::new(RefCell::new(vec![0u32; 17])); + let radix = RadixBuffer::new(Rc::clone(&kbuf), Rc::clone(&rbuf), 1.into()); + (kbuf, rbuf, radix) + } + + fn append_key(kbuf: &Shared<KB>, key: &Key) -> KeyId { + borrow_shared(&kbuf, |kvec| { + let pos = kvec.len(); + kvec.extend_from_slice(key.deref()); + Ok(KeyId::from(pos as u32)) + }).unwrap() + } + + fn check_duplicated_key_ids(key: Key) -> bool { + let (kbuf, _rbuf, radix) = prepare_kbuf_radix(); + let key_id1 = append_key(&kbuf, &key); + let key_id2 = append_key(&kbuf, &key); + radix.insert(key_id1).expect("insert"); + radix.insert(key_id2).unwrap_err().description() == "same key with different IDs" + } + + fn check_with_stdset(std_set: HashSet<Key>) -> bool { + let (kbuf, _rbuf, radix) = prepare_kbuf_radix(); + for &key in &std_set { + if radix.lookup(&key).unwrap().is_some() { + return false; + } + let key_id = append_key(&kbuf, &key); + radix.insert(key_id).expect("insert"); + radix.insert_with_key(key_id, &key).expect("insert"); + } + for &orig in &std_set { + for bit in 0..161 { + let mut key = orig; + if bit > 0 { + let mut buf: [u8; 20] = *orig; + buf[(bit - 1) / 8] ^= 1u8 << ((bit - 1) % 8); + key = buf.into(); + } + if std_set.contains(&key) ^ radix.lookup(&key).unwrap().is_some() { + return false; + } + } + } + true + } + + #[test] + fn test_errors() { + let (_kbuf, _rbuf, radix) = prepare_kbuf_radix(); + let key = Key::default(); + let r = radix.insert_with_key((1 << 31).into(), &key); + assert_eq!(r.unwrap_err().description(), "offset overflow"); + assert_eq!( + radix.insert(1.into()).unwrap_err().description(), + "incomplete key" + ); + } + + #[bench] + fn bench_50k_insert(b: &mut Bencher) { + const COUNT: u32 = 51200; + let key_buf: Vec<u8> = ChaChaRng::new_unseeded() + .gen_iter() + .take(COUNT as usize * Key::fixed_len()) + .collect(); + let kbuf = Rc::new(RefCell::new(key_buf)); + b.iter(|| { + let 
rbuf = Rc::new(RefCell::new(vec![0u32; 16])); + let radix = RadixBuffer::new(Rc::clone(&kbuf), Rc::clone(&rbuf), 0.into()); + for key_id in 0..COUNT { + let v = key_id * Key::fixed_len() as u32; + radix.insert(v.into()).expect("insert"); + } + }) + } + + fn prepare_radix_keys(count: usize) -> (RadixBuffer, Vec<Key>) { + let mut rng = ChaChaRng::new_unseeded(); + let (kbuf, _rbuf, radix) = prepare_kbuf_radix(); + let mut keys = vec![]; + for _ in 0..count { + let key = rng.gen(); + let key_id = append_key(&kbuf, &key); + radix.insert_with_key(key_id, &key).expect("insert"); + keys.push(key); + } + (radix, keys) + } + + #[bench] + fn bench_50k_fast_lookups(b: &mut Bencher) { + let (radix, keys) = prepare_radix_keys(51200); + b.iter(|| for key in &keys { + radix.lookup_unchecked(&key).expect("lookup"); + }) + } + + #[bench] + fn bench_50k_slow_lookups(b: &mut Bencher) { + let (radix, keys) = prepare_radix_keys(51200); + b.iter(|| for key in &keys { + radix.lookup(&key).expect("lookup"); + }) + } + + impl Arbitrary for Key { + fn arbitrary<G: Gen>(gen: &mut G) -> Self { gen.gen() } + + fn shrink(&self) -> Box<Iterator<Item = Key>> { unimplemented!() } + } + + quickcheck! { + fn test_compare_with_stdset(std_set: HashSet<Key>) -> bool { + check_with_stdset(std_set) + } + + fn test_duplicated_key_ids(key: Key) -> bool { + check_duplicated_key_ids(key) + } + } +}