diff --git a/mercurial/dirstatemap.py b/mercurial/dirstatemap.py --- a/mercurial/dirstatemap.py +++ b/mercurial/dirstatemap.py @@ -657,8 +657,31 @@ return self._rustmap def write(self, tr, st, now): - if self._use_dirstate_v2: - packed = self._rustmap.write_v2(now) + if not self._use_dirstate_v2: + p1, p2 = self.parents() + packed = self._rustmap.write_v1(p1, p2, now) + st.write(packed) + st.close() + return + + # We can only append to an existing data file if there is one + can_append = self.docket.uuid is not None + packed, append = self._rustmap.write_v2(now, can_append) + if append: + docket = self.docket + with self._opener(docket.data_filename(), b'ab') as fp: + assert fp.tell() == docket.data_size, ( + fp.tell(), + docket.data_size, + ) + written = fp.write(packed) + if written is not None: # py2 return None + assert written == len(packed), (written, len(packed)) + docket.data_size += len(packed) + docket.parents = self.parents() + st.write(docket.serialize()) + st.close() + else: old_docket = self.docket new_docket = docketmod.DirstateDocket.with_new_uuid( self.parents(), len(packed) @@ -674,11 +697,8 @@ if old_docket.uuid: self._opener.unlink(old_docket.data_filename()) self._docket = new_docket - else: - p1, p2 = self.parents() - packed = self._rustmap.write_v1(p1, p2, now) - st.write(packed) - st.close() + # Reload from the newly-written file + util.clearcachedproperty(self, b"_rustmap") self._dirtyparents = False @propertycache diff --git a/rust/hg-core/src/dirstate_tree/dirstate_map.rs b/rust/hg-core/src/dirstate_tree/dirstate_map.rs --- a/rust/hg-core/src/dirstate_tree/dirstate_map.rs +++ b/rust/hg-core/src/dirstate_tree/dirstate_map.rs @@ -468,6 +468,24 @@ Ok((map, parents)) } + /// Assuming dirstate-v2 format, returns whether the next write should + /// append to the existing data file that contains `self.on_disk` (true), + /// or create a new data file from scratch (false). + pub(super) fn write_should_append(&self) -> bool { + // Soon this will be a heuristic based on the amount of unreachable + // data. For now it’s pseudo-random in order to make tests exercise + // both code paths. + + fn bad_rng() -> u32 { + std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap() + .subsec_millis() + } + + bad_rng() % 2 == 0 + } + fn get_node<'tree>( &'tree self, path: &HgPath, @@ -1043,8 +1061,15 @@ Ok(packed) } + /// Returns new data together with whether that data should be appended to + /// the existing data file whose content is at `self.on_disk` (true), + /// instead of written to a new data file (false). #[timed] - fn pack_v2(&mut self, now: Timestamp) -> Result, DirstateError> { + fn pack_v2( + &mut self, + now: Timestamp, + can_append: bool, + ) -> Result<(Vec, bool), DirstateError> { // TODO: how do we want to handle this in 2038? let now: i32 = now.0.try_into().expect("time overflow"); let mut paths = Vec::new(); @@ -1063,7 +1088,7 @@ self.clear_known_ambiguous_mtimes(&paths)?; - on_disk::write(self) + on_disk::write(self, can_append) } fn status<'a>( diff --git a/rust/hg-core/src/dirstate_tree/dispatch.rs b/rust/hg-core/src/dirstate_tree/dispatch.rs --- a/rust/hg-core/src/dirstate_tree/dispatch.rs +++ b/rust/hg-core/src/dirstate_tree/dispatch.rs @@ -179,11 +179,19 @@ /// Clear mtimes that are ambigous with `now` (similar to /// `clear_ambiguous_times` but for all files in the dirstate map), and - /// serialize bytes to write the `.hg/dirstate` file to disk in dirstate-v2 + /// serialize bytes to write a dirstate data file to disk in dirstate-v2 /// format. /// + /// Returns new data together with whether that data should be appended to + /// the existing data file whose content is at `self.on_disk` (true), + /// instead of written to a new data file (false). + /// /// Note: this is only supported by the tree dirstate map. - fn pack_v2(&mut self, now: Timestamp) -> Result, DirstateError>; + fn pack_v2( + &mut self, + now: Timestamp, + can_append: bool, + ) -> Result<(Vec, bool), DirstateError>; /// Run the status algorithm. /// @@ -383,7 +391,11 @@ self.pack(parents, now) } - fn pack_v2(&mut self, _now: Timestamp) -> Result, DirstateError> { + fn pack_v2( + &mut self, + _now: Timestamp, + _can_append: bool, + ) -> Result<(Vec, bool), DirstateError> { panic!( "should have used dirstate_tree::DirstateMap to use the v2 format" ) diff --git a/rust/hg-core/src/dirstate_tree/on_disk.rs b/rust/hg-core/src/dirstate_tree/on_disk.rs --- a/rust/hg-core/src/dirstate_tree/on_disk.rs +++ b/rust/hg-core/src/dirstate_tree/on_disk.rs @@ -544,20 +544,28 @@ recur(on_disk, root.root_nodes, &mut f) } +/// Returns new data together with whether that data should be appended to the +/// existing data file whose content is at `dirstate_map.on_disk` (true), +/// instead of written to a new data file (false). pub(super) fn write( dirstate_map: &mut DirstateMap, -) -> Result, DirstateError> { - let root_len = std::mem::size_of::(); + can_append: bool, +) -> Result<(Vec, bool), DirstateError> { + let append = can_append && dirstate_map.write_should_append(); // This ignores the space for paths, and for nodes without an entry. // TODO: better estimate? Skip the `Vec` and write to a file directly? - let size_guess = root_len + let size_guess = std::mem::size_of::() + std::mem::size_of::() * dirstate_map.nodes_with_entry_count as usize; - let mut out = Vec::with_capacity(size_guess); - let root_nodes = - write_nodes(dirstate_map, dirstate_map.root.as_ref(), &mut out)?; + let mut writer = Writer { + dirstate_map, + append, + out: Vec::with_capacity(size_guess), + }; + + let root_nodes = writer.write_nodes(dirstate_map.root.as_ref())?; let root = Root { root_nodes, @@ -567,112 +575,121 @@ .into(), ignore_patterns_hash: dirstate_map.ignore_patterns_hash, }; - out.extend(root.as_bytes()); - Ok(out) + writer.out.extend(root.as_bytes()); + Ok((writer.out, append)) +} + +struct Writer<'dmap, 'on_disk> { + dirstate_map: &'dmap DirstateMap<'on_disk>, + append: bool, + out: Vec, } -fn write_nodes( - dirstate_map: &DirstateMap, - nodes: dirstate_map::ChildNodesRef, - out: &mut Vec, -) -> Result { - // `dirstate_map::ChildNodes` is a `HashMap` with undefined iteration - // order. Sort to enable binary search in the written file. - let nodes = nodes.sorted(); - let nodes_len = nodes.len(); +impl Writer<'_, '_> { + fn write_nodes( + &mut self, + nodes: dirstate_map::ChildNodesRef, + ) -> Result { + // `dirstate_map::ChildNodes` is a `HashMap` with undefined iteration + // order. Sort to enable binary search in the written file. + let nodes = nodes.sorted(); + let nodes_len = nodes.len(); - // First accumulate serialized nodes in a `Vec` - let mut on_disk_nodes = Vec::with_capacity(nodes_len); - for node in nodes { - let children = write_nodes( - dirstate_map, - node.children(dirstate_map.on_disk)?, - out, - )?; - let full_path = node.full_path(dirstate_map.on_disk)?; - let full_path = write_path(full_path.as_bytes(), out); - let copy_source = - if let Some(source) = node.copy_source(dirstate_map.on_disk)? { - write_path(source.as_bytes(), out) + // First accumulate serialized nodes in a `Vec` + let mut on_disk_nodes = Vec::with_capacity(nodes_len); + for node in nodes { + let children = + self.write_nodes(node.children(self.dirstate_map.on_disk)?)?; + let full_path = node.full_path(self.dirstate_map.on_disk)?; + let full_path = self.write_path(full_path.as_bytes()); + let copy_source = if let Some(source) = + node.copy_source(self.dirstate_map.on_disk)? + { + self.write_path(source.as_bytes()) } else { PathSlice { start: 0.into(), len: 0.into(), } }; - on_disk_nodes.push(match node { - NodeRef::InMemory(path, node) => { - let (state, data) = match &node.data { - dirstate_map::NodeData::Entry(entry) => ( - entry.state.into(), - Entry { - mode: entry.mode.into(), - mtime: entry.mtime.into(), - size: entry.size.into(), - }, - ), - dirstate_map::NodeData::CachedDirectory { mtime } => { - (b'd', Entry::from_timestamp(*mtime)) + on_disk_nodes.push(match node { + NodeRef::InMemory(path, node) => { + let (state, data) = match &node.data { + dirstate_map::NodeData::Entry(entry) => ( + entry.state.into(), + Entry { + mode: entry.mode.into(), + mtime: entry.mtime.into(), + size: entry.size.into(), + }, + ), + dirstate_map::NodeData::CachedDirectory { mtime } => { + (b'd', Entry::from_timestamp(*mtime)) + } + dirstate_map::NodeData::None => ( + b'\0', + Entry { + mode: 0.into(), + mtime: 0.into(), + size: 0.into(), + }, + ), + }; + Node { + children, + copy_source, + full_path, + base_name_start: u16::try_from(path.base_name_start()) + // Could only panic for paths over 64 KiB + .expect("dirstate-v2 path length overflow") + .into(), + descendants_with_entry_count: node + .descendants_with_entry_count + .into(), + tracked_descendants_count: node + .tracked_descendants_count + .into(), + state, + data, } - dirstate_map::NodeData::None => ( - b'\0', - Entry { - mode: 0.into(), - mtime: 0.into(), - size: 0.into(), - }, - ), - }; - Node { + } + NodeRef::OnDisk(node) => Node { children, copy_source, full_path, - base_name_start: u16::try_from(path.base_name_start()) - // Could only panic for paths over 64 KiB - .expect("dirstate-v2 path length overflow") - .into(), - descendants_with_entry_count: node - .descendants_with_entry_count - .into(), - tracked_descendants_count: node - .tracked_descendants_count - .into(), - state, - data, - } - } - NodeRef::OnDisk(node) => Node { - children, - copy_source, - full_path, - ..*node - }, - }) + ..*node + }, + }) + } + // … so we can write them contiguously, after writing everything else + // they refer to. + let start = self.curent_offset(); + let len = u32::try_from(nodes_len) + // Could only panic with over 4 billion nodes + .expect("dirstate-v2 path length overflow") + .into(); + self.out.extend(on_disk_nodes.as_bytes()); + Ok(ChildNodes { start, len }) } - // … so we can write them contiguously, after writing everything else they - // refer to. - let start = curent_offset(out); - let len = u32::try_from(nodes_len) - // Could only panic with over 4 billion nodes - .expect("dirstate-v2 path length overflow") - .into(); - out.extend(on_disk_nodes.as_bytes()); - Ok(ChildNodes { start, len }) -} -fn curent_offset(out: &Vec) -> Offset { - u32::try_from(out.len()) - // Could only panic for a dirstate file larger than 4 GiB - .expect("dirstate-v2 offset overflow") - .into() -} + fn curent_offset(&mut self) -> Offset { + let mut offset = self.out.len(); + if self.append { + offset += self.dirstate_map.on_disk.len() + } + u32::try_from(offset) + // Could only panic for a dirstate file larger than 4 GiB + .expect("dirstate-v2 offset overflow") + .into() + } -fn write_path(slice: &[u8], out: &mut Vec) -> PathSlice { - let start = curent_offset(out); - let len = u16::try_from(slice.len()) - // Could only panic for paths over 64 KiB - .expect("dirstate-v2 path length overflow") - .into(); - out.extend(slice.as_bytes()); - PathSlice { start, len } + fn write_path(&mut self, slice: &[u8]) -> PathSlice { + let start = self.curent_offset(); + let len = u16::try_from(slice.len()) + // Could only panic for paths over 64 KiB + .expect("dirstate-v2 path length overflow") + .into(); + self.out.extend(slice.as_bytes()); + PathSlice { start, len } + } } diff --git a/rust/hg-cpython/src/dirstate/dirstate_map.rs b/rust/hg-cpython/src/dirstate/dirstate_map.rs --- a/rust/hg-cpython/src/dirstate/dirstate_map.rs +++ b/rust/hg-cpython/src/dirstate/dirstate_map.rs @@ -340,16 +340,23 @@ } } + /// Returns new data together with whether that data should be appended to + /// the existing data file whose content is at `self.on_disk` (True), + /// instead of written to a new data file (False). def write_v2( &self, - now: PyObject - ) -> PyResult { + now: PyObject, + can_append: bool, + ) -> PyResult { let now = Timestamp(now.extract(py)?); let mut inner = self.inner(py).borrow_mut(); - let result = inner.pack_v2(now); + let result = inner.pack_v2(now, can_append); match result { - Ok(packed) => Ok(PyBytes::new(py, &packed)), + Ok((packed, append)) => { + let packed = PyBytes::new(py, &packed); + Ok((packed, append).to_py_object(py).into_object()) + }, Err(_) => Err(PyErr::new::( py, "Dirstate error".to_string(), diff --git a/rust/hg-cpython/src/dirstate/dispatch.rs b/rust/hg-cpython/src/dirstate/dispatch.rs --- a/rust/hg-cpython/src/dirstate/dispatch.rs +++ b/rust/hg-cpython/src/dirstate/dispatch.rs @@ -124,8 +124,12 @@ self.get_mut().pack_v1(parents, now) } - fn pack_v2(&mut self, now: Timestamp) -> Result, DirstateError> { - self.get_mut().pack_v2(now) + fn pack_v2( + &mut self, + now: Timestamp, + can_append: bool, + ) -> Result<(Vec, bool), DirstateError> { + self.get_mut().pack_v2(now, can_append) } fn status<'a>(