diff --git a/rust/Cargo.lock b/rust/Cargo.lock
--- a/rust/Cargo.lock
+++ b/rust/Cargo.lock
@@ -49,8 +49,10 @@
 name = "hg-core"
 version = "0.1.0"
 dependencies = [
+ "lazy_static 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
  "rand 0.6.5 (registry+https://github.com/rust-lang/crates.io-index)",
  "rand_pcg 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
+ "regex 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
 ]
 
 [[package]]
@@ -74,7 +76,7 @@
 
 [[package]]
 name = "lazy_static"
-version = "1.2.0"
+version = "1.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 
 [[package]]
@@ -265,7 +267,7 @@
 version = "0.3.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 dependencies = [
- "lazy_static 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
+ "lazy_static 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
 ]
 
 [[package]]
@@ -310,7 +312,7 @@
 "checksum cloudabi 0.0.3 (registry+https://github.com/rust-lang/crates.io-index)" = "ddfc5b9aa5d4507acaf872de71051dfd0e309860e88966e1051e462a077aac4f"
 "checksum cpython 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "b489034e723e7f5109fecd19b719e664f89ef925be785885252469e9822fa940"
 "checksum fuchsia-cprng 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "81f7f8eb465745ea9b02e2704612a9946a59fa40572086c6fd49d6ddcf30bf31"
-"checksum lazy_static 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "a374c89b9db55895453a74c1e38861d9deec0b01b405a82516e9d5de4820dea1"
+"checksum lazy_static 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "bc5729f27f159ddd61f4df6228e827e86643d4d3e7c32183cb30a1c08f604a14"
 "checksum libc 0.2.45 (registry+https://github.com/rust-lang/crates.io-index)" = "2d2857ec59fadc0773853c664d2d18e7198e83883e7060b63c924cb077bd5c74"
 "checksum memchr 2.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "db4c41318937f6e76648f42826b1d9ade5c09cafb5aef7e351240a70f39206e9"
 "checksum num-traits 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)" = "0b3a5d7cc97d6d30d8b9bc8fa19bf45349ffe46241e8816f50f62f6d6aaabee1"
diff --git a/rust/hg-core/Cargo.toml b/rust/hg-core/Cargo.toml
--- a/rust/hg-core/Cargo.toml
+++ b/rust/hg-core/Cargo.toml
@@ -10,3 +10,7 @@
 [dev-dependencies]
 rand = "*"
 rand_pcg = "*"
+
+[dependencies]
+lazy_static = "1.3.0"
+regex = "^1.1"
diff --git a/rust/hg-core/src/filepatterns.rs b/rust/hg-core/src/filepatterns.rs
new file mode 100644
--- /dev/null
+++ b/rust/hg-core/src/filepatterns.rs
@@ -0,0 +1,359 @@
+use crate::{LineNumber, PatternError, PatternFileError};
+use regex::Regex;
+use std::borrow::Cow;
+use std::collections::HashMap;
+use std::fs::File;
+use std::io::Read;
+use std::vec::Vec;
+
+// Lookup table indexed by byte value: each entry is the byte itself,
+// preceded by a backslash when the byte is listed in `to_escape`.
+lazy_static! {
+    static ref RE_ESCAPE: Vec<Vec<u8>> = {
+        let mut v: Vec<Vec<u8>> = (0..=255).map(|byte| vec![byte]).collect();
+        let to_escape = b"()[]{}?*+-|^$\\.&~# \t\n\r\x0b\x0c";
+        for byte in to_escape {
+            v[*byte as usize].insert(0, b'\\');
+        }
+        v
+    };
+}
+
+/// Replacements for what follows a `*` in a glob, tried in order; the
+/// empty source always matches, so a lone `*` becomes `[^/]*`.
+const GLOB_REPLACEMENTS: &[(&[u8], &[u8])] =
+    &[(b"*/", b"(?:.*/)?"), (b"*", b".*"), (b"", b"[^/]*")];
+
+#[derive(Debug, Copy, Clone, PartialEq, Eq)]
+pub enum PatternSyntax {
+    Regexp,
+    RootGlob,
+    // Glob that matches at the front of the path
+    Glob,
+    // Glob that matches at any suffix of the path (still anchored at slashes)
+    Path,
+    RelPath,
+    RelGlob,
+    RelRegexp,
+    RootFiles,
+}
+
+/// Transforms a glob pattern into a regex
+///
+/// `*` becomes `[^/]*`, `**` becomes `.*` and `**/` becomes `(?:.*/)?`;
+/// `?` becomes `.`; `[...]` and `{a,b}` become a character class and an
+/// alternation; every other byte goes through the escape table.
+fn glob_to_re(pat: &[u8]) -> Vec<u8> {
+    let mut input = pat;
+    let mut res: Vec<u8> = vec![];
+    let mut group_depth = 0;
+
+    while let Some((c, rest)) = input.split_first() {
+        input = rest;
+
+        match c {
+            b'*' => {
+                for (source, repl) in GLOB_REPLACEMENTS {
+                    if input.starts_with(source) {
+                        input = &input[source.len()..];
+                        res.extend(repl.iter());
+                        break;
+                    }
+                }
+            }
+            b'?' => res.extend(b"."),
+            b'[' => {
+                // A `]` right after `[` is a class member, not its end.
+                match input
+                    .iter()
+                    .enumerate()
+                    .position(|(i, b)| *b == b']' && i != 0)
+                {
+                    None => res.extend(b"\\["),
+                    Some(end) => {
+                        res.extend(b"[");
+                        for (i, b) in input[..end].iter().enumerate() {
+                            if *b == b'!' && i == 0 {
+                                res.extend(b"^")
+                            } else if *b == b'^' && i == 0 {
+                                res.extend(b"\\^")
+                            } else if *b == b'\\' {
+                                res.extend(b"\\\\")
+                            } else {
+                                res.push(*b)
+                            }
+                        }
+                        res.extend(b"]");
+                        input = &input[end + 1..];
+                    }
+                }
+            }
+            b'{' => {
+                group_depth += 1;
+                res.extend(b"(?:")
+            }
+            b'}' if group_depth > 0 => {
+                group_depth -= 1;
+                res.extend(b")");
+            }
+            b',' if group_depth > 0 => res.extend(b"|"),
+            b'\\' => {
+                // A trailing backslash has nothing to quote: escape itself.
+                let c = {
+                    if let Some((c, rest)) = input.split_first() {
+                        input = rest;
+                        c
+                    } else {
+                        c
+                    }
+                };
+                res.extend(&RE_ESCAPE[*c as usize])
+            }
+            _ => res.extend(&RE_ESCAPE[*c as usize]),
+        }
+    }
+    res
+}
+
+/// Backslash-escapes each byte of `pattern` that the escape table marks.
+fn escape_pattern(pattern: &[u8]) -> Vec<u8> {
+    pattern
+        .iter()
+        .flat_map(|c| RE_ESCAPE[*c as usize].clone())
+        .collect()
+}
+
+/// Maps a pattern kind name (without the trailing `:`) to its
+/// `PatternSyntax`, or `PatternError::UnsupportedSyntax` if unknown.
+fn parse_pattern_syntax(kind: &[u8]) -> Result<PatternSyntax, PatternError> {
+    match kind {
+        b"re" => Ok(PatternSyntax::Regexp),
+        b"path" => Ok(PatternSyntax::Path),
+        b"relpath" => Ok(PatternSyntax::RelPath),
+        b"rootfilesin" => Ok(PatternSyntax::RootFiles),
+        b"relglob" => Ok(PatternSyntax::RelGlob),
+        b"relre" => Ok(PatternSyntax::RelRegexp),
+        b"glob" => Ok(PatternSyntax::Glob),
+        b"rootglob" => Ok(PatternSyntax::RootGlob),
+        _ => Err(PatternError::UnsupportedSyntax(
+            String::from_utf8_lossy(kind).to_string(),
+        )),
+    }
+}
+
+/// Builds the regex that corresponds to the given pattern.
+/// If within a `syntax: regexp` context, returns the pattern,
+/// otherwise, returns the corresponding regex.
+fn _build_single_regex(
+    syntax: PatternSyntax,
+    pattern: &[u8],
+    globsuffix: &[u8],
+) -> Vec<u8> {
+    if pattern.is_empty() {
+        return vec![];
+    }
+    match syntax {
+        PatternSyntax::Regexp => pattern.to_owned(),
+        PatternSyntax::RelRegexp => {
+            if pattern[0] == b'^' {
+                return pattern.to_owned();
+            }
+            let mut res = b".*".to_vec();
+            res.extend(pattern);
+            res
+        }
+        PatternSyntax::Path | PatternSyntax::RelPath => {
+            if pattern == b"." {
+                return vec![];
+            }
+            let mut pattern = escape_pattern(pattern);
+            pattern.extend(b"(?:/|$)");
+            pattern
+        }
+        PatternSyntax::RootFiles => {
+            let mut res = if pattern == b"." {
+                vec![]
+            } else {
+                // Pattern is a directory name.
+                let mut as_vec: Vec<u8> = escape_pattern(pattern);
+                as_vec.push(b'/');
+                as_vec
+            };
+
+            // Anything after the pattern must be a non-directory.
+            res.extend(b"[^/]+$");
+            res
+        }
+        PatternSyntax::Glob
+        | PatternSyntax::RelGlob
+        | PatternSyntax::RootGlob => {
+            let mut res: Vec<u8> = vec![];
+            if syntax == PatternSyntax::RelGlob {
+                res.extend(b"(?:|.*/)");
+            }
+
+            res.extend(glob_to_re(pattern));
+            res.extend(globsuffix.iter());
+            res
+        }
+    }
+}
+
+const GLOB_SPECIAL_CHARACTERS: [u8; 7] =
+    [b'*', b'?', b'[', b']', b'{', b'}', b'\\'];
+
+/// Wrapper function to `_build_single_regex` that short-circuits 'exact' globs
+/// (no glob special character) that need no transformation into a regex.
+pub fn build_single_regex(
+    kind: &str,
+    pat: &[u8],
+    globsuffix: &[u8],
+) -> Result<Vec<u8>, PatternError> {
+    let enum_kind = parse_pattern_syntax(kind.as_bytes())?;
+    if enum_kind == PatternSyntax::RootGlob
+        && !pat.iter().any(|b| GLOB_SPECIAL_CHARACTERS.contains(b))
+    {
+        Ok(pat.to_vec())
+    } else {
+        Ok(_build_single_regex(enum_kind, pat, globsuffix))
+    }
+}
+
+lazy_static! {
+    static ref SYNTAXES: HashMap<&'static str, &'static str> = {
+        let mut m = HashMap::new();
+
+        m.insert("re", "relre:");
+        m.insert("regexp", "relre:");
+        m.insert("glob", "relglob:");
+        m.insert("rootglob", "rootglob:");
+        m.insert("include", "include");
+        m.insert("subinclude", "subinclude");
+        m
+    };
+}
+
+pub type PatternTuple = (String, LineNumber, String);
+type WarningTuple = (String, String);
+
+pub fn parse_pattern_file_contents(
+    lines: &str,
+    file_path: &str,
+    warn: bool,
+) -> (Vec<PatternTuple>, Vec<WarningTuple>) {
+    let comment_regex = Regex::new(r"((?:^|[^\\])(?:\\\\)*)#.*").unwrap();
+    let mut inputs: Vec<PatternTuple> = vec![];
+    let mut warnings: Vec<WarningTuple> = vec![];
+
+    let mut current_syntax = Cow::Borrowed("relre:");
+
+    for (line_number, line) in lines.split('\n').enumerate() {
+        let line_number = line_number + 1;
+        let mut line = line.to_string();
+        let mut syntax = current_syntax.to_string();
+        let mut syntax = syntax.as_ref();
+
+        if line.contains('#') {
+            if let Some(cap) = comment_regex.captures(line.clone().as_ref()) {
+                line = line[..cap.get(1).unwrap().end()].to_string()
+            }
+            line = line.replace(r"\#", "#");
+        }
+        line = str::trim_end(line.as_ref()).to_string();
+
+        if line.is_empty() {
+            continue;
+        }
+
+        if line.starts_with("syntax:") {
+            syntax = str::trim(&line["syntax:".len()..]);
+
+            if let Some(rel_syntax) = SYNTAXES.get(syntax) {
+                current_syntax = Cow::Owned(rel_syntax.to_string());
+            } else if warn {
+                warnings.push((file_path.to_string(), syntax.to_string()));
+            }
+            continue;
+        }
+
+        let mut line_syntax = syntax;
+        let mut final_line = line.clone();
+
+        for (s, rels) in SYNTAXES.iter() {
+            if final_line.starts_with(rels) {
+                line_syntax = rels;
+                final_line = line[rels.len()..].to_string();
+                break;
+            } else if final_line.starts_with(&format!("{}:", s)) {
+                line_syntax = rels;
+                final_line = line[s.len() + 1..].to_string();
+                break;
+            }
+        }
+
+        inputs.push((
+            line_syntax.to_string() + &final_line,
+            line_number,
+            final_line,
+        ));
+        current_syntax = Cow::Owned(syntax.to_string());
+    }
+    (inputs, warnings)
+}
+
+pub fn read_pattern_file(
+    file_path: String,
+    warn: bool,
+) -> Result<(Vec<PatternTuple>, Vec<WarningTuple>), PatternFileError> {
+    let mut f = File::open(&file_path)?;
+    let mut contents = String::new();
+    f.read_to_string(&mut contents).map(|_| {
+        Ok(parse_pattern_file_contents(&contents, &file_path, warn))
+    })?
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn escape_pattern_test() {
+        let untouched = br#"!"%',/0123456789:;<=>@ABCDEFGHIJKLMNOPQRSTUVWXYZ_`abcdefghijklmnopqrstuvwxyz"#;
+        assert_eq!(escape_pattern(untouched), untouched.to_vec());
+        // All escape codes
+        assert_eq!(
+            escape_pattern(br#"()[]{}?*+-|^$\\.&~# \t\n\r\v\f"#),
+            br#"\(\)\[\]\{\}\?\*\+\-\|\^\$\\\\\.\&\~\#\ \\t\\n\\r\\v\\f"#
+                .to_vec()
+        );
+    }
+
+    #[test]
+    fn glob_test() {
+        assert_eq!(glob_to_re(br#"?"#), br#"."#);
+        assert_eq!(glob_to_re(br#"*"#), br#"[^/]*"#);
+        assert_eq!(glob_to_re(br#"**"#), br#".*"#);
+        assert_eq!(glob_to_re(br#"**/a"#), br#"(?:.*/)?a"#);
+        assert_eq!(glob_to_re(br#"a/**/b"#), br#"a/(?:.*/)?b"#);
+        assert_eq!(glob_to_re(br#"[a*?!^][^b][!c]"#), br#"[a*?!^][\^b][^c]"#);
+        assert_eq!(glob_to_re(br#"{a,b}"#), br#"(?:a|b)"#);
+        assert_eq!(glob_to_re(br#".\*\?"#), br#"\.\*\?"#);
+    }
+
+    #[test]
+    fn test_parse_pattern_file_contents() {
+        let lines = "syntax: glob\n*.elc";
+
+        assert_eq!(
+            vec![("relglob:*.elc".to_string(), 2, "*.elc".to_string())],
+            parse_pattern_file_contents(lines, "file_path", false).0,
+        );
+
+        let lines = "syntax: include\nsyntax: glob";
+
+        assert_eq!(
+            parse_pattern_file_contents(lines, "file_path", false).0,
+            vec![]
+        );
+    }
+}
diff --git a/rust/hg-core/src/lib.rs b/rust/hg-core/src/lib.rs
--- a/rust/hg-core/src/lib.rs
+++ b/rust/hg-core/src/lib.rs
@@ -4,8 +4,18 @@
 // GNU General Public License version 2 or any later version.
mod ancestors; pub mod dagops; + pub use ancestors::{AncestorsIterator, LazyAncestors, MissingAncestors}; pub mod testing; // unconditionally built, for use from integration tests +mod filepatterns; + +pub use filepatterns::{ + build_single_regex, read_pattern_file, PatternSyntax, PatternTuple +}; + +#[macro_use] +extern crate lazy_static; +extern crate regex; /// Mercurial revision numbers /// @@ -34,8 +44,27 @@ fn parents(&self, Revision) -> Result<[Revision; 2], GraphError>; } +pub type LineNumber = usize; + #[derive(Clone, Debug, PartialEq)] pub enum GraphError { ParentOutOfRange(Revision), WorkingDirectoryUnsupported, } + +#[derive(Debug)] +pub enum PatternError { + UnsupportedSyntax(String), +} + +#[derive(Debug)] +pub enum PatternFileError { + IO(std::io::Error), + Pattern(PatternError, LineNumber), +} + +impl From for PatternFileError { + fn from(e: std::io::Error) -> Self { + PatternFileError::IO(e) + } +} \ No newline at end of file