diff --git a/rust/hg-core/src/config/config.rs b/rust/hg-core/src/config/config.rs
--- a/rust/hg-core/src/config/config.rs
+++ b/rust/hg-core/src/config/config.rs
@@ -388,6 +388,16 @@
         })
     }
 
+    /// If there is an `item` value in `section`, parse and return a list of
+    /// byte strings.
+    pub fn get_list(
+        &self,
+        section: &[u8],
+        item: &[u8],
+    ) -> Option<Vec<Vec<u8>>> {
+        self.get(section, item).map(values::parse_list)
+    }
+
     /// Returns the raw value bytes of the first one found, or `None`.
     pub fn get(&self, section: &[u8], item: &[u8]) -> Option<&[u8]> {
         self.get_inner(section, item)
diff --git a/rust/hg-core/src/config/values.rs b/rust/hg-core/src/config/values.rs
--- a/rust/hg-core/src/config/values.rs
+++ b/rust/hg-core/src/config/values.rs
@@ -8,6 +8,8 @@
 //! details about where the value came from (but omits details of what’s
 //! invalid inside the value).
 
+use crate::utils::SliceExt;
+
 pub(super) fn parse_bool(v: &[u8]) -> Option<bool> {
     match v.to_ascii_lowercase().as_slice() {
         b"1" | b"yes" | b"true" | b"on" | b"always" => Some(true),
@@ -42,6 +44,216 @@
     value.parse().ok()
 }
 
+/// Parse a config value as a list of sub-values.
+///
+/// Ported from `parselist` in `mercurial/utils/stringutil.py`
+
+// Note: keep behavior in sync with the Python one.
+
+// Note: this could return `Vec<Cow<[u8]>>` instead and borrow `input` when
+// possible (when there’s no backslash-escapes) but this is probably not worth
+// the complexity as config is presumably not accessed inside
+// performance-sensitive loops.
+pub(super) fn parse_list(input: &[u8]) -> Vec<Vec<u8>> {
+    // Port of Python’s `value.lstrip(b' ,\n')`
+    // TODO: is this really what we want?
+    let input =
+        input.trim_start_matches(|b| b == b' ' || b == b',' || b == b'\n');
+    parse_list_without_trim_start(input)
+}
+
+fn parse_list_without_trim_start(input: &[u8]) -> Vec<Vec<u8>> {
+    // Start of port of Python’s `_configlist`
+    let input = input.trim_end_matches(|b| b == b' ' || b == b',');
+    if input.is_empty() {
+        return Vec::new();
+    }
+
+    // Just to make “a string” less confusable with “a list of strings”.
+    type ByteString = Vec<u8>;
+
+    // These correspond to Python’s…
+    let mut mode = ParserMode::Plain; // `parser`
+    let mut values = Vec::new(); // `parts[:-1]`
+    let mut next_value = ByteString::new(); // `parts[-1]`
+    let mut offset = 0; // `offset`
+
+    // Setting `parser` to `None` is instead handled by returning immediately
+    enum ParserMode {
+        Plain,
+        Quoted,
+    }
+
+    loop {
+        match mode {
+            ParserMode::Plain => {
+                // Start of port of Python’s `_parse_plain`
+                let mut whitespace = false;
+                while let Some(&byte) = input.get(offset) {
+                    if is_space(byte) || byte == b',' {
+                        whitespace = true;
+                        offset += 1;
+                    } else {
+                        break;
+                    }
+                }
+                if let Some(&byte) = input.get(offset) {
+                    if whitespace {
+                        values.push(std::mem::take(&mut next_value))
+                    }
+                    if byte == b'"' && next_value.is_empty() {
+                        mode = ParserMode::Quoted;
+                    } else {
+                        if byte == b'"' && next_value.ends_with(b"\\") {
+                            next_value.pop();
+                        }
+                        next_value.push(byte);
+                    }
+                    offset += 1;
+                } else {
+                    values.push(next_value);
+                    return values;
+                }
+            }
+            ParserMode::Quoted => {
+                // Start of port of Python’s `_parse_quote`
+                if let Some(&byte) = input.get(offset) {
+                    if byte == b'"' {
+                        // The input contains a quoted zero-length value `""`
+                        debug_assert_eq!(next_value, b"");
+                        values.push(std::mem::take(&mut next_value));
+                        offset += 1;
+                        while let Some(&byte) = input.get(offset) {
+                            if is_space(byte) || byte == b',' {
+                                offset += 1;
+                            } else {
+                                break;
+                            }
+                        }
+                        mode = ParserMode::Plain;
+                        continue;
+                    }
+                }
+
+                while let Some(&byte) = input.get(offset) {
+                    if byte == b'"' {
+                        break;
+                    }
+                    if byte == b'\\' && input.get(offset + 1) == Some(&b'"') {
+                        next_value.push(b'"');
+                        offset += 2;
+                    } else {
+                        next_value.push(byte);
+                        offset += 1;
+                    }
+                }
+
+                if offset >= input.len() {
+                    // We didn’t find a closing double-quote,
+                    // so treat the opening one as part of an unquoted value
+                    // instead of delimiting the start of a quoted value.
+
+                    // `next_value` may have had some backslash-escapes
+                    // unescaped. TODO: shouldn’t we use a slice of `input`
+                    // instead?
+                    let mut real_values =
+                        parse_list_without_trim_start(&next_value);
+
+                    if let Some(first) = real_values.first_mut() {
+                        first.insert(0, b'"');
+                        // Drop `next_value`
+                        values.extend(real_values)
+                    } else {
+                        next_value.push(b'"');
+                        values.push(next_value);
+                    }
+                    return values;
+                }
+
+                // We’re not at the end of the input, which means the `while`
+                // loop above ended at a double quote. Skip
+                // over that.
+                offset += 1;
+
+                while let Some(&byte) = input.get(offset) {
+                    if byte == b' ' || byte == b',' {
+                        offset += 1;
+                    } else {
+                        break;
+                    }
+                }
+
+                if offset >= input.len() {
+                    values.push(next_value);
+                    return values;
+                }
+
+                if offset + 1 == input.len() && input[offset] == b'"' {
+                    next_value.push(b'"');
+                    offset += 1;
+                } else {
+                    values.push(std::mem::take(&mut next_value));
+                }
+
+                mode = ParserMode::Plain;
+            }
+        }
+    }
+
+    // https://docs.python.org/3/library/stdtypes.html?#bytes.isspace
+    fn is_space(byte: u8) -> bool {
+        if let b' ' | b'\t' | b'\n' | b'\r' | b'\x0b' | b'\x0c' = byte {
+            true
+        } else {
+            false
+        }
+    }
+}
+
+#[test]
+fn test_parse_list() {
+    // Make `assert_eq` error messages nicer
+    fn as_strings(values: &[Vec<u8>]) -> Vec<String> {
+        values
+            .iter()
+            .map(|v| std::str::from_utf8(v.as_ref()).unwrap().to_owned())
+            .collect()
+    }
+    macro_rules! assert_parse_list {
+        ( $input: expr => [ $( $output: expr ),* ] ) => {
+            assert_eq!(
+                as_strings(&parse_list($input)),
+                as_strings(&[ $( Vec::from(&$output[..]) ),* ]),
+            );
+        }
+    }
+
+    // Keep these Rust tests in sync with the Python ones in
+    // `tests/test-config-parselist.py`
+    assert_parse_list!(b"" => []);
+    assert_parse_list!(b"," => []);
+    assert_parse_list!(b"A" => [b"A"]);
+    assert_parse_list!(b"B,B" => [b"B", b"B"]);
+    assert_parse_list!(b", C, ,C," => [b"C", b"C"]);
+    assert_parse_list!(b"\"" => [b"\""]);
+    assert_parse_list!(b"\"\"" => [b"", b""]);
+    assert_parse_list!(b"D,\"" => [b"D", b"\""]);
+    assert_parse_list!(b"E,\"\"" => [b"E", b"", b""]);
+    assert_parse_list!(b"\"F,F\"" => [b"F,F"]);
+    assert_parse_list!(b"\"G,G" => [b"\"G", b"G"]);
+    assert_parse_list!(b"\"H \\\",\\\"H" => [b"\"H", b",", b"H"]);
+    assert_parse_list!(b"I,I\"" => [b"I", b"I\""]);
+    assert_parse_list!(b"J,\"J" => [b"J", b"\"J"]);
+    assert_parse_list!(b"K K" => [b"K", b"K"]);
+    assert_parse_list!(b"\"K\" K" => [b"K", b"K"]);
+    assert_parse_list!(b"L\tL" => [b"L", b"L"]);
+    assert_parse_list!(b"\"L\"\tL" => [b"L", b"", b"L"]);
+    assert_parse_list!(b"M\x0bM" => [b"M", b"M"]);
+    assert_parse_list!(b"\"M\"\x0bM" => [b"M", b"", b"M"]);
+    assert_parse_list!(b"\"N\" , ,\"" => [b"N\""]);
+    assert_parse_list!(b"\" ,O, " => [b"\"", b"O"]);
+}
+
 #[test]
 fn test_parse_byte_size() {
     assert_eq!(parse_byte_size(b""), None);
diff --git a/tests/test-config-parselist.py b/tests/test-config-parselist.py
new file mode 100644
--- /dev/null
+++ b/tests/test-config-parselist.py
@@ -0,0 +1,52 @@
+"""
+List-valued configuration keys have an ad-hoc microsyntax. From `hg help config`:
+
+> List values are separated by whitespace or comma, except when values are
+> placed in double quotation marks:
+>
+> allow_read = "John Doe, PhD", brian, betty
+>
+> Quotation marks can be escaped by prefixing them with a backslash. Only
+> quotation marks at the beginning of a word is counted as a quotation
+> (e.g., ``foo"bar baz`` is the list of ``foo"bar`` and ``baz``).
+
+That help documentation is fairly light on details, the actual parser has many
+other edge cases. This test tries to cover them.
+"""
+
+from mercurial.utils import stringutil
+
+
+def assert_parselist(input, expected):
+    result = stringutil.parselist(input)
+    if result != expected:
+        raise AssertionError(
+            "parse_input(%r)\n got %r\nexpected %r"
+            % (input, result, expected)
+        )
+
+
+# Keep these Python tests in sync with the Rust ones in `rust/hg-core/src/config/values.rs`
+
+assert_parselist(b'', [])
+assert_parselist(b',', [])
+assert_parselist(b'A', [b'A'])
+assert_parselist(b'B,B', [b'B', b'B'])
+assert_parselist(b', C, ,C,', [b'C', b'C'])
+assert_parselist(b'"', [b'"'])
+assert_parselist(b'""', [b'', b''])
+assert_parselist(b'D,"', [b'D', b'"'])
+assert_parselist(b'E,""', [b'E', b'', b''])
+assert_parselist(b'"F,F"', [b'F,F'])
+assert_parselist(b'"G,G', [b'"G', b'G'])
+assert_parselist(b'"H \\",\\"H', [b'"H', b',', b'H'])
+assert_parselist(b'I,I"', [b'I', b'I"'])
+assert_parselist(b'J,"J', [b'J', b'"J'])
+assert_parselist(b'K K', [b'K', b'K'])
+assert_parselist(b'"K" K', [b'K', b'K'])
+assert_parselist(b'L\tL', [b'L', b'L'])
+assert_parselist(b'"L"\tL', [b'L', b'', b'L'])
+assert_parselist(b'M\x0bM', [b'M', b'M'])
+assert_parselist(b'"M"\x0bM', [b'M', b'', b'M'])
+assert_parselist(b'"N" , ,"', [b'N"'])
+assert_parselist(b'" ,O, ', [b'"', b'O'])