diff --git a/mercurial/revlogutils/flagutil.py b/mercurial/revlogutils/flagutil.py --- a/mercurial/revlogutils/flagutil.py +++ b/mercurial/revlogutils/flagutil.py @@ -32,6 +32,7 @@ REVIDX_FLAGS_ORDER REVIDX_RAWTEXT_CHANGING_FLAGS +# Keep this in sync with REVIDX_KNOWN_FLAGS in rust/hg-core/src/revlog/revlog.rs REVIDX_KNOWN_FLAGS = util.bitsfrom(REVIDX_FLAGS_ORDER) # Store flag processors (cf. 'addflagprocessor()' to register) diff --git a/rust/hg-core/src/revlog/filelog.rs b/rust/hg-core/src/revlog/filelog.rs --- a/rust/hg-core/src/revlog/filelog.rs +++ b/rust/hg-core/src/revlog/filelog.rs @@ -73,6 +73,89 @@ pub struct FilelogEntry<'a>(RevlogEntry<'a>); impl FilelogEntry<'_> { + /// `self.data()` can be expensive, with decompression and delta + /// resolution. + /// + /// *Without* paying this cost, based on revlog index information + /// including `RevlogEntry::uncompressed_len`: + /// + /// * Returns `true` if the length that `self.data().file_data().len()` + /// would return is definitely **not equal** to `other_len`. + /// * Returns `false` if available information is inconclusive. + pub fn file_data_len_not_equal_to(&self, other_len: u64) -> bool { + // Relevant code that implement this behavior in Python code: + // basefilectx.cmp, filelog.size, storageutil.filerevisioncopied, + // revlog.size, revlog.rawsize + + // Let’s call `file_data_len` what would be returned by + // `self.data().file_data().len()`. + + if self.0.is_cencored() { + let file_data_len = 0; + return other_len != file_data_len; + } + + if self.0.has_length_affecting_flag_processor() { + // We can’t conclude anything about `file_data_len`. + return false; + } + + // Revlog revisions (usually) have metadata for the size of + // their data after decompression and delta resolution + // as would be returned by `Revlog::get_rev_data`. + // + // For filelogs this is the file’s contents preceded by an optional + // metadata block. + let uncompressed_len = if let Some(l) = self.0.uncompressed_len() { + l as u64 + } else { + // The field was set to -1, the actual uncompressed len is unknown. + // We need to decompress to say more. + return false; + }; + // `uncompressed_len = file_data_len + optional_metadata_len`, + // so `file_data_len <= uncompressed_len`. + if uncompressed_len < other_len { + // Transitively, `file_data_len < other_len`. + // So `other_len != file_data_len` definitely. + return true; + } + + if uncompressed_len == other_len + 4 { + // It’s possible that `file_data_len == other_len` with an empty + // metadata block (2 start marker bytes + 2 end marker bytes). + // This happens when there wouldn’t otherwise be metadata, but + // the first 2 bytes of file data happen to match a start marker + // and would be ambiguous. + return false; + } + + if !self.0.has_p1() { + // There may or may not be copy metadata, so we can’t deduce more + // about `file_data_len` without computing file data. + return false; + } + + // Filelog ancestry is not meaningful in the way changelog ancestry is. + // It only provides hints to delta generation. + // p1 and p2 are set to null when making a copy or rename since + // contents are likely unrelatedto what might have previously existed + // at the destination path. + // + // Conversely, since here p1 is non-null, there is no copy metadata. + // Note that this reasoning may be invalidated in the presence of + // merges made by some previous versions of Mercurial that + // swapped p1 and p2. See + // and `tests/test-issue6528.t`. + // + // Since copy metadata is currently the only kind of metadata + // kept in revlog data of filelogs, + // this `FilelogEntry` does not have such metadata: + let file_data_len = uncompressed_len; + + return file_data_len != other_len; + } + pub fn data(&self) -> Result { Ok(FilelogRevisionData(self.0.data()?.into_owned())) } diff --git a/rust/hg-core/src/revlog/index.rs b/rust/hg-core/src/revlog/index.rs --- a/rust/hg-core/src/revlog/index.rs +++ b/rust/hg-core/src/revlog/index.rs @@ -260,6 +260,10 @@ } } + pub fn flags(&self) -> u16 { + BigEndian::read_u16(&self.bytes[6..=7]) + } + /// Return the compressed length of the data. pub fn compressed_len(&self) -> u32 { BigEndian::read_u32(&self.bytes[8..=11]) diff --git a/rust/hg-core/src/revlog/revlog.rs b/rust/hg-core/src/revlog/revlog.rs --- a/rust/hg-core/src/revlog/revlog.rs +++ b/rust/hg-core/src/revlog/revlog.rs @@ -20,6 +20,18 @@ use crate::revlog::Revision; use crate::{Node, NULL_REVISION}; +const REVISION_FLAG_CENSORED: u16 = 1 << 15; +const REVISION_FLAG_ELLIPSIS: u16 = 1 << 14; +const REVISION_FLAG_EXTSTORED: u16 = 1 << 13; +const REVISION_FLAG_HASCOPIESINFO: u16 = 1 << 12; + +// Keep this in sync with REVIDX_KNOWN_FLAGS in +// mercurial/revlogutils/flagutil.py +const REVIDX_KNOWN_FLAGS: u16 = REVISION_FLAG_CENSORED + | REVISION_FLAG_ELLIPSIS + | REVISION_FLAG_EXTSTORED + | REVISION_FLAG_HASCOPIESINFO; + #[derive(derive_more::From)] pub enum RevlogError { InvalidRevision, @@ -282,6 +294,7 @@ }, p1: index_entry.p1(), p2: index_entry.p2(), + flags: index_entry.flags(), hash: *index_entry.hash(), }; Ok(entry) @@ -309,6 +322,7 @@ base_rev_or_base_of_delta_chain: Option, p1: Revision, p2: Revision, + flags: u16, hash: Node, } @@ -321,6 +335,20 @@ u32::try_from(self.uncompressed_len).ok() } + pub fn has_p1(&self) -> bool { + self.p1 != NULL_REVISION + } + + pub fn is_cencored(&self) -> bool { + (self.flags & REVISION_FLAG_CENSORED) != 0 + } + + pub fn has_length_affecting_flag_processor(&self) -> bool { + // Relevant Python code: revlog.size() + // note: ELLIPSIS is known to not change the content + (self.flags & (REVIDX_KNOWN_FLAGS ^ REVISION_FLAG_ELLIPSIS)) != 0 + } + /// The data for this entry, after resolving deltas if any. pub fn data(&self) -> Result, HgError> { let mut entry = self.clone(); diff --git a/rust/rhg/src/commands/status.rs b/rust/rhg/src/commands/status.rs --- a/rust/rhg/src/commands/status.rs +++ b/rust/rhg/src/commands/status.rs @@ -516,16 +516,16 @@ filelog.entry_for_node(entry.node_id()?).map_err(|_| { HgError::corrupted("filelog missing node from manifest") })?; - // TODO: check `fs_len` here like below, but based on - // `RevlogEntry::uncompressed_len` without decompressing the full filelog - // contents where possible. This is only valid if the revlog data does not - // contain metadata. See how Python’s `revlog.rawsize` calls - // `storageutil.filerevisioncopied`. - // (Maybe also check for content-modifying flags? See `revlog.size`.) - let filelog_data = filelog_entry.data()?; - let contents_in_p1 = filelog_data.file_data()?; - if contents_in_p1.len() as u64 != fs_len { - // No need to read the file contents: + if filelog_entry.file_data_len_not_equal_to(fs_len) { + // No need to read file contents: + // it cannot be equal if it has a different length. + return Ok(true); + } + + let p1_filelog_data = filelog_entry.data()?; + let p1_contents = p1_filelog_data.file_data()?; + if p1_contents.len() as u64 != fs_len { + // No need to read file contents: // it cannot be equal if it has a different length. return Ok(true); } @@ -535,5 +535,5 @@ } else { vfs.read(fs_path)? }; - Ok(contents_in_p1 != &*fs_contents) + Ok(p1_contents != &*fs_contents) }