diff --git a/rust/Cargo.lock b/rust/Cargo.lock --- a/rust/Cargo.lock +++ b/rust/Cargo.lock @@ -358,6 +358,7 @@ "format-bytes", "home", "im-rc", + "itertools", "lazy_static", "log", "memmap", diff --git a/rust/hg-core/Cargo.toml b/rust/hg-core/Cargo.toml --- a/rust/hg-core/Cargo.toml +++ b/rust/hg-core/Cargo.toml @@ -14,6 +14,7 @@ derive_more = "0.99" home = "0.5" im-rc = "15.0.*" +itertools = "0.9" lazy_static = "1.4.0" rand = "0.7.3" rand_pcg = "0.2.1" diff --git a/rust/hg-core/src/dirstate.rs b/rust/hg-core/src/dirstate.rs --- a/rust/hg-core/src/dirstate.rs +++ b/rust/hg-core/src/dirstate.rs @@ -42,6 +42,19 @@ pub fn is_from_other_parent(&self) -> bool { self.state == EntryState::Normal && self.size == SIZE_FROM_OTHER_PARENT } + + // TODO: other platforms + #[cfg(unix)] + pub fn mode_changed( + &self, + filesystem_metadata: &std::fs::Metadata, + ) -> bool { + use std::os::unix::fs::MetadataExt; + const EXEC_BIT_MASK: u32 = 0o100; + let dirstate_exec_bit = (self.mode as u32) & EXEC_BIT_MASK; + let fs_exec_bit = filesystem_metadata.mode() & EXEC_BIT_MASK; + dirstate_exec_bit != fs_exec_bit + } } #[derive(BytesCast)] diff --git a/rust/hg-core/src/dirstate/status.rs b/rust/hg-core/src/dirstate/status.rs --- a/rust/hg-core/src/dirstate/status.rs +++ b/rust/hg-core/src/dirstate/status.rs @@ -95,7 +95,7 @@ type IoResult = std::io::Result; -/// `Box` is syntactic sugar for `Box`, so add +/// `Box` is syntactic sugar for `Box`, so add /// an explicit lifetime here to not fight `'static` bounds "out of nowhere". pub type IgnoreFnType<'a> = Box Fn(&'r HgPath) -> bool + Sync + 'a>; @@ -255,7 +255,7 @@ pub collect_traversed_dirs: bool, } -#[derive(Debug)] +#[derive(Debug, Default)] pub struct DirstateStatus<'a> { /// Tracked files whose contents have changed since the parent revision pub modified: Vec>, diff --git a/rust/hg-core/src/dirstate_tree/dirstate_map.rs b/rust/hg-core/src/dirstate_tree/dirstate_map.rs --- a/rust/hg-core/src/dirstate_tree/dirstate_map.rs +++ b/rust/hg-core/src/dirstate_tree/dirstate_map.rs @@ -27,7 +27,7 @@ pub struct DirstateMap { parents: Option, dirty_parents: bool, - root: ChildNodes, + pub(super) root: ChildNodes, /// Number of nodes anywhere in the tree that have `.entry.is_some()`. nodes_with_entry_count: usize, @@ -42,17 +42,17 @@ /// path, so comparing full paths gives the same result as comparing base /// names. However `BTreeMap` would waste time always re-comparing the same /// string prefix. -type ChildNodes = BTreeMap, Node>; +pub(super) type ChildNodes = BTreeMap, Node>; /// Represents a file or a directory #[derive(Default)] -struct Node { +pub(super) struct Node { /// `None` for directories - entry: Option, + pub(super) entry: Option, - copy_source: Option, + pub(super) copy_source: Option, - children: ChildNodes, + pub(super) children: ChildNodes, /// How many (non-inclusive) descendants of this node are tracked files tracked_descendants_count: usize, @@ -67,6 +67,10 @@ false } } + + pub(super) fn state(&self) -> Option { + self.entry.as_ref().map(|entry| entry.state) + } } /// `(full_path, entry, copy_source)` diff --git a/rust/hg-core/src/dirstate_tree/status.rs b/rust/hg-core/src/dirstate_tree/status.rs --- a/rust/hg-core/src/dirstate_tree/status.rs +++ b/rust/hg-core/src/dirstate_tree/status.rs @@ -1,17 +1,379 @@ +use crate::dirstate::status::IgnoreFnType; +use crate::dirstate_tree::dirstate_map::ChildNodes; use crate::dirstate_tree::dirstate_map::DirstateMap; +use crate::dirstate_tree::dirstate_map::Node; +use crate::matchers::get_ignore_function; use crate::matchers::Matcher; +use crate::utils::files::get_bytes_from_os_string; +use crate::utils::hg_path::HgPath; use crate::DirstateStatus; +use crate::EntryState; +use crate::HgPathBuf; use crate::PatternFileWarning; use crate::StatusError; use crate::StatusOptions; +use std::borrow::Cow; +use std::io; +use std::path::Path; use std::path::PathBuf; -pub fn status<'a>( - _dmap: &'a mut DirstateMap, - _matcher: &'a (dyn Matcher + Sync), - _root_dir: PathBuf, - _ignore_files: Vec, - _options: StatusOptions, -) -> Result<(DirstateStatus<'a>, Vec), StatusError> { - todo!() +/// Returns the status of the working directory compared to its parent +/// changeset. +/// +/// This algorithm is based on traversing the filesystem tree (`fs` in function +/// and variable names) and dirstate tree at the same time. The core of this +/// traversal is the recursive `traverse_fs_directory_and_dirstate` function +/// and its use of `itertools::merge_join_by`. When reaching a path that only +/// exists in one of the two trees, depending on information requested by +/// `options` we may need to traverse the remaining subtree. +pub fn status<'tree>( + dmap: &'tree mut DirstateMap, + matcher: &(dyn Matcher + Sync), + root_dir: PathBuf, + ignore_files: Vec, + options: StatusOptions, +) -> Result<(DirstateStatus<'tree>, Vec), StatusError> { + let (ignore_fn, warnings): (IgnoreFnType, _) = + if options.list_ignored || options.list_unknown { + get_ignore_function(ignore_files, &root_dir)? + } else { + (Box::new(|&_| true), vec![]) + }; + + let mut common = StatusCommon { + options, + matcher, + ignore_fn, + outcome: DirstateStatus::default(), + }; + let is_at_repo_root = true; + let hg_path = HgPath::new(""); + let has_ignored_ancestor = false; + common.traverse_fs_directory_and_dirstate( + has_ignored_ancestor, + &mut dmap.root, + hg_path, + &root_dir, + is_at_repo_root, + ); + Ok((common.outcome, warnings)) +} + +/// Bag of random things needed by various parts of the algorithm. Reduces the +/// number of parameters passed to functions. +struct StatusCommon<'tree, 'a> { + options: StatusOptions, + matcher: &'a (dyn Matcher + Sync), + ignore_fn: IgnoreFnType<'a>, + outcome: DirstateStatus<'tree>, } + +impl<'tree, 'a> StatusCommon<'tree, 'a> { + fn traverse_fs_directory_and_dirstate( + &mut self, + has_ignored_ancestor: bool, + dirstate_nodes: &'tree mut ChildNodes, + directory_hg_path: &HgPath, + fs_path: &Path, + is_at_repo_root: bool, + ) { + // TODO: handle I/O errors + let mut fs_entries = + DirEntry::read_dir(fs_path, is_at_repo_root).unwrap(); + + // `merge_join_by` requires both its input iterators to be sorted: + + // * `BTreeMap` iterates according to keys’ ordering by definition + + // `sort_unstable_by_key` doesn’t allow keys borrowing from the value: + // https://github.com/rust-lang/rust/issues/34162 + fs_entries.sort_unstable_by(|e1, e2| e1.base_name.cmp(&e2.base_name)); + + for pair in itertools::merge_join_by( + dirstate_nodes, + &fs_entries, + |(full_path, _node), fs_entry| { + full_path.base_name().cmp(&fs_entry.base_name) + }, + ) { + use itertools::EitherOrBoth::*; + match pair { + Both((hg_path, dirstate_node), fs_entry) => { + self.traverse_fs_and_dirstate( + fs_entry, + hg_path.full_path(), + dirstate_node, + has_ignored_ancestor, + ); + } + Left((hg_path, dirstate_node)) => self.traverse_dirstate_only( + hg_path.full_path(), + dirstate_node, + ), + Right(fs_entry) => self.traverse_fs_only( + has_ignored_ancestor, + directory_hg_path, + fs_entry, + ), + } + } + } + + fn traverse_fs_and_dirstate( + &mut self, + fs_entry: &DirEntry, + hg_path: &'tree HgPath, + dirstate_node: &'tree mut Node, + has_ignored_ancestor: bool, + ) { + if fs_entry.metadata.is_dir() { + if self.options.collect_traversed_dirs { + self.outcome.traversed.push(hg_path.into()) + } + // If we previously had a file here, it was removed (with + // `hg rm` or similar) or deleted before it could be + // replaced by a directory. + self.mark_removed_or_deleted_if_file( + hg_path, + dirstate_node.state(), + ); + let is_ignored = has_ignored_ancestor || (self.ignore_fn)(hg_path); + let is_at_repo_root = false; + self.traverse_fs_directory_and_dirstate( + is_ignored, + &mut dirstate_node.children, + hg_path, + &fs_entry.full_path, + is_at_repo_root, + ); + } else { + if self.matcher.matches(hg_path) { + let full_path = Cow::from(hg_path); + if let Some(entry) = &dirstate_node.entry { + match entry.state { + EntryState::Added => { + self.outcome.added.push(full_path) + } + EntryState::Removed => { + self.outcome.removed.push(full_path) + } + EntryState::Merged => { + self.outcome.modified.push(full_path) + } + EntryState::Normal => { + self.handle_normal_file( + full_path, + dirstate_node, + entry, + fs_entry, + ); + } + // This variant is not used in DirstateMap + // nodes + EntryState::Unknown => unreachable!(), + } + } else { + // `node.entry.is_none()` indicates a "directory" + // node, but the filesystem has a file + self.mark_unknown_or_ignored( + has_ignored_ancestor, + full_path, + ) + } + } + + for (child_hg_path, child_node) in &mut dirstate_node.children { + self.traverse_dirstate_only( + child_hg_path.full_path(), + child_node, + ) + } + } + } + + /// A file with `EntryState::Normal` in the dirstate was found in the + /// filesystem + fn handle_normal_file( + &mut self, + full_path: Cow<'tree, HgPath>, + dirstate_node: &Node, + entry: &crate::DirstateEntry, + fs_entry: &DirEntry, + ) { + // Keep the low 31 bits + fn truncate_u64(value: u64) -> i32 { + (value & 0x7FFF_FFFF) as i32 + } + fn truncate_i64(value: i64) -> i32 { + (value & 0x7FFF_FFFF) as i32 + } + + let mode_changed = || { + self.options.check_exec && entry.mode_changed(&fs_entry.metadata) + }; + let size_changed = entry.size != truncate_u64(fs_entry.metadata.len()); + if entry.size >= 0 + && size_changed + && fs_entry.metadata.file_type().is_symlink() + { + // issue6456: Size returned may be longer due to encryption + // on EXT-4 fscrypt. TODO maybe only do it on EXT4? + self.outcome.unsure.push(full_path) + } else if dirstate_node.copy_source.is_some() + || entry.is_from_other_parent() + || (entry.size >= 0 && (size_changed || mode_changed())) + { + self.outcome.modified.push(full_path) + } else { + let mtime = mtime_seconds(&fs_entry.metadata); + if truncate_i64(mtime) != entry.mtime + || mtime == self.options.last_normal_time + { + self.outcome.unsure.push(full_path) + } else if self.options.list_clean { + self.outcome.clean.push(full_path) + } + } + } + + /// A node in the dirstate tree has no corresponding filesystem entry + fn traverse_dirstate_only( + &mut self, + hg_path: &'tree HgPath, + dirstate_node: &'tree mut Node, + ) { + self.mark_removed_or_deleted_if_file(hg_path, dirstate_node.state()); + for (child_hg_path, child_node) in &mut dirstate_node.children { + self.traverse_dirstate_only(child_hg_path.full_path(), child_node) + } + } + + /// A node in the dirstate tree has no corresponding *file* on the + /// filesystem + /// + /// Does nothing on a "directory" node + fn mark_removed_or_deleted_if_file( + &mut self, + hg_path: &'tree HgPath, + dirstate_node_state: Option, + ) { + if let Some(state) = dirstate_node_state { + if self.matcher.matches(hg_path) { + if let EntryState::Removed = state { + self.outcome.removed.push(hg_path.into()) + } else { + self.outcome.deleted.push(hg_path.into()) + } + } + } + } + + /// Something in the filesystem has no corresponding dirstate node + fn traverse_fs_only( + &mut self, + has_ignored_ancestor: bool, + directory_hg_path: &HgPath, + fs_entry: &DirEntry, + ) { + let hg_path = directory_hg_path.join(&fs_entry.base_name); + if fs_entry.metadata.is_dir() { + let is_ignored = + has_ignored_ancestor || (self.ignore_fn)(&hg_path); + let traverse_children = if is_ignored { + // Descendants of an ignored directory are all ignored + self.options.list_ignored + } else { + // Descendants of an unknown directory may be either unknown or + // ignored + self.options.list_unknown || self.options.list_ignored + }; + if traverse_children { + let is_at_repo_root = false; + // TODO: handle I/O errors + let children_fs_entries = + DirEntry::read_dir(&fs_entry.full_path, is_at_repo_root) + .unwrap(); + for child_fs_entry in children_fs_entries { + self.traverse_fs_only( + is_ignored, + &hg_path, + &child_fs_entry, + ) + } + } + if self.options.collect_traversed_dirs { + self.outcome.traversed.push(hg_path.into()) + } + } else if self.matcher.matches(&hg_path) { + self.mark_unknown_or_ignored(has_ignored_ancestor, hg_path.into()) + } + } + + fn mark_unknown_or_ignored( + &mut self, + has_ignored_ancestor: bool, + hg_path: Cow<'tree, HgPath>, + ) { + let is_ignored = has_ignored_ancestor || (self.ignore_fn)(&hg_path); + if is_ignored { + if self.options.list_ignored { + self.outcome.ignored.push(hg_path) + } + } else { + if self.options.list_unknown { + self.outcome.unknown.push(hg_path) + } + } + } +} + +#[cfg(unix)] // TODO +fn mtime_seconds(metadata: &std::fs::Metadata) -> i64 { + // Going through `Metadata::modified()` would be portable, but would take + // care to construct a `SystemTime` value with sub-second precision just + // for us to throw that away here. + use std::os::unix::fs::MetadataExt; + metadata.mtime() +} + +struct DirEntry { + base_name: HgPathBuf, + full_path: PathBuf, + metadata: std::fs::Metadata, +} + +impl DirEntry { + /// Returns **unsorted** entries in the given directory, with name and + /// metadata. + /// + /// If a `.hg` sub-directory is encountered: + /// + /// * At the repository root, ignore that sub-directory + /// * Elsewhere, we’re listing the content of a sub-repo. Return an empty + /// list instead. + fn read_dir(path: &Path, is_at_repo_root: bool) -> io::Result> { + let mut results = Vec::new(); + for entry in path.read_dir()? { + let entry = entry?; + let metadata = entry.metadata()?; + let name = get_bytes_from_os_string(entry.file_name()); + // FIXME don't do this when cached + if name == b".hg" { + if is_at_repo_root { + // Skip the repo’s own .hg (might be a symlink) + continue; + } else if metadata.is_dir() { + // A .hg sub-directory at another location means a subrepo, + // skip it entirely. + return Ok(Vec::new()); + } + } + results.push(DirEntry { + base_name: name.into(), + full_path: entry.path(), + metadata, + }) + } + Ok(results) + } +} diff --git a/rust/hg-core/src/utils/files.rs b/rust/hg-core/src/utils/files.rs --- a/rust/hg-core/src/utils/files.rs +++ b/rust/hg-core/src/utils/files.rs @@ -17,7 +17,7 @@ use lazy_static::lazy_static; use same_file::is_same_file; use std::borrow::{Cow, ToOwned}; -use std::ffi::OsStr; +use std::ffi::{OsStr, OsString}; use std::fs::Metadata; use std::iter::FusedIterator; use std::ops::Deref; @@ -53,6 +53,12 @@ str.as_ref().as_bytes().to_vec() } +#[cfg(unix)] +pub fn get_bytes_from_os_string(str: OsString) -> Vec { + use std::os::unix::ffi::OsStringExt; + str.into_vec() +} + /// An iterator over repository path yielding itself and its ancestors. #[derive(Copy, Clone, Debug)] pub struct Ancestors<'a> { diff --git a/rust/hg-core/src/utils/hg_path.rs b/rust/hg-core/src/utils/hg_path.rs --- a/rust/hg-core/src/utils/hg_path.rs +++ b/rust/hg-core/src/utils/hg_path.rs @@ -6,6 +6,7 @@ // GNU General Public License version 2 or any later version. use std::borrow::Borrow; +use std::borrow::Cow; use std::convert::TryFrom; use std::ffi::{OsStr, OsString}; use std::fmt; @@ -535,6 +536,24 @@ } } +impl From for Cow<'_, HgPath> { + fn from(path: HgPathBuf) -> Self { + Cow::Owned(path) + } +} + +impl<'a> From<&'a HgPath> for Cow<'a, HgPath> { + fn from(path: &'a HgPath) -> Self { + Cow::Borrowed(path) + } +} + +impl<'a> From<&'a HgPathBuf> for Cow<'a, HgPath> { + fn from(path: &'a HgPathBuf) -> Self { + Cow::Borrowed(&**path) + } +} + #[cfg(test)] mod tests { use super::*;