Details
- Reviewers: Alphare
- Group Reviewers: hg-reviewers
- Commits: rHGfa2b1a46d92e: encoding: remove Python 2 support code
Diff Detail
- Repository: rHG Mercurial
- Branch: default
- Lint: No Linters Available
- Unit: No Unit Test Coverage
( )
Alphare |
hg-reviewers |
No Linters Available |
No Unit Test Coverage |
Path | Packages | |||
---|---|---|---|---|
M | mercurial/encoding.py (88 lines) |
Commit | Parents | Author | Summary | Date |
---|---|---|---|---|
94199021b779 | 6839c9985319 | Gregory Szorc | | Mar 3 2022, 10:58 AM |
isasciistr = charencode.isasciistr | isasciistr = charencode.isasciistr | ||||
asciilower = charencode.asciilower | asciilower = charencode.asciilower | ||||
asciiupper = charencode.asciiupper | asciiupper = charencode.asciiupper | ||||
_jsonescapeu8fast = charencode.jsonescapeu8fast | _jsonescapeu8fast = charencode.jsonescapeu8fast | ||||
_sysstr = pycompat.sysstr | _sysstr = pycompat.sysstr | ||||
if pycompat.ispy3: | |||||
unichr = chr | unichr = chr | ||||
# These unicode characters are ignored by HFS+ (Apple Technote 1150, | # These unicode characters are ignored by HFS+ (Apple Technote 1150, | ||||
# "Unicode Subtleties"), so we need to ignore them in some places for | # "Unicode Subtleties"), so we need to ignore them in some places for | ||||
# sanity. | # sanity. | ||||
_ignore = [ | _ignore = [ | ||||
unichr(int(x, 16)).encode("utf-8") | unichr(int(x, 16)).encode("utf-8") | ||||
for x in b"200c 200d 200e 200f 202a 202b 202c 202d 202e " | for x in b"200c 200d 200e 200f 202a 202b 202c 202d 202e " | ||||
b"206a 206b 206c 206d 206e 206f feff".split() | b"206a 206b 206c 206d 206e 206f feff".split() | ||||
if b"\xe2" in s or b"\xef" in s: | if b"\xe2" in s or b"\xef" in s: | ||||
for c in _ignore: | for c in _ignore: | ||||
s = s.replace(c, b'') | s = s.replace(c, b'') | ||||
return s | return s | ||||
# encoding.environ is provided read-only, which may not be used to modify | # encoding.environ is provided read-only, which may not be used to modify | ||||
# the process environment | # the process environment | ||||
_nativeenviron = not pycompat.ispy3 or os.supports_bytes_environ | _nativeenviron = os.supports_bytes_environ | ||||
if not pycompat.ispy3: | if _nativeenviron: | ||||
environ = os.environ # re-exports | |||||
elif _nativeenviron: | |||||
environ = os.environb # re-exports | environ = os.environb # re-exports | ||||
else: | else: | ||||
# preferred encoding isn't known yet; use utf-8 to avoid unicode error | # preferred encoding isn't known yet; use utf-8 to avoid unicode error | ||||
# and recreate it once encoding is settled | # and recreate it once encoding is settled | ||||
environ = { | environ = { | ||||
k.encode('utf-8'): v.encode('utf-8') | k.encode('utf-8'): v.encode('utf-8') | ||||
for k, v in os.environ.items() # re-exports | for k, v in os.environ.items() # re-exports | ||||
} | } | ||||
_encodingrewrites = { | _encodingrewrites = { | ||||
b'646': b'ascii', | b'646': b'ascii', | ||||
b'ANSI_X3.4-1968': b'ascii', | b'ANSI_X3.4-1968': b'ascii', | ||||
} | } | ||||
# cp65001 is a Windows variant of utf-8, which isn't supported on Python 2. | # cp65001 is a Windows variant of utf-8, which isn't supported on Python 2. | ||||
# No idea if it should be rewritten to the canonical name 'utf-8' on Python 3. | # No idea if it should be rewritten to the canonical name 'utf-8' on Python 3. | ||||
# https://bugs.python.org/issue13216 | # https://bugs.python.org/issue13216 | ||||
if pycompat.iswindows and not pycompat.ispy3: | if pycompat.iswindows: | ||||
_encodingrewrites[b'cp65001'] = b'utf-8' | _encodingrewrites[b'cp65001'] = b'utf-8' | ||||
try: | try: | ||||
encoding = environ.get(b"HGENCODING") | encoding = environ.get(b"HGENCODING") | ||||
if not encoding: | if not encoding: | ||||
encoding = locale.getpreferredencoding().encode('ascii') or b'ascii' | encoding = locale.getpreferredencoding().encode('ascii') or b'ascii' | ||||
encoding = _encodingrewrites.get(encoding, encoding) | encoding = _encodingrewrites.get(encoding, encoding) | ||||
except locale.Error: | except locale.Error: | ||||
return unifromlocal(bytesfunc(obj)) | return unifromlocal(bytesfunc(obj)) | ||||
return unifunc | return unifunc | ||||
# converter functions between native str and byte string. use these if the | # converter functions between native str and byte string. use these if the | ||||
# character encoding is not aware (e.g. exception message) or is known to | # character encoding is not aware (e.g. exception message) or is known to | ||||
# be locale dependent (e.g. date formatting.) | # be locale dependent (e.g. date formatting.) | ||||
if pycompat.ispy3: | |||||
strtolocal = unitolocal | strtolocal = unitolocal | ||||
strfromlocal = unifromlocal | strfromlocal = unifromlocal | ||||
strmethod = unimethod | strmethod = unimethod | ||||
else: | |||||
def strtolocal(s): | |||||
# type: (str) -> bytes | |||||
return s # pytype: disable=bad-return-type | |||||
def strfromlocal(s): | |||||
# type: (bytes) -> str | |||||
return s # pytype: disable=bad-return-type | |||||
strmethod = pycompat.identity | |||||
def lower(s): | def lower(s): | ||||
# type: (bytes) -> bytes | # type: (bytes) -> bytes | ||||
"""best-effort encoding-aware case-folding of local string s""" | """best-effort encoding-aware case-folding of local string s""" | ||||
try: | try: | ||||
return asciilower(s) | return asciilower(s) | ||||
except UnicodeDecodeError: | except UnicodeDecodeError: | ||||
raise error.Abort( | raise error.Abort( | ||||
pycompat.bytestr(k), hint=b"please check your locale settings" | pycompat.bytestr(k), hint=b"please check your locale settings" | ||||
) | ) | ||||
if not _nativeenviron: | if not _nativeenviron: | ||||
# now encoding and helper functions are available, recreate the environ | # now encoding and helper functions are available, recreate the environ | ||||
# dict to be exported to other modules | # dict to be exported to other modules | ||||
if pycompat.iswindows and pycompat.ispy3: | if pycompat.iswindows: | ||||
class WindowsEnviron(dict): | class WindowsEnviron(dict): | ||||
"""`os.environ` normalizes environment variables to uppercase on windows""" | """`os.environ` normalizes environment variables to uppercase on windows""" | ||||
def get(self, key, default=None): | def get(self, key, default=None): | ||||
return super().get(upper(key), default) | return super().get(upper(key), default) | ||||
environ = WindowsEnviron() | environ = WindowsEnviron() | ||||
for k, v in os.environ.items(): # re-exports | for k, v in os.environ.items(): # re-exports | ||||
environ[tolocal(k.encode('utf-8'))] = tolocal(v.encode('utf-8')) | environ[tolocal(k.encode('utf-8'))] = tolocal(v.encode('utf-8')) | ||||
DRIVE_RE = re.compile(b'^[a-z]:') | DRIVE_RE = re.compile(b'^[a-z]:') | ||||
if pycompat.ispy3: | |||||
# os.getcwd() on Python 3 returns string, but it has os.getcwdb() which | # os.getcwd() on Python 3 returns string, but it has os.getcwdb() which | ||||
# returns bytes. | # returns bytes. | ||||
if pycompat.iswindows: | if pycompat.iswindows: | ||||
# Python 3 on Windows issues a DeprecationWarning about using the bytes | # Python 3 on Windows issues a DeprecationWarning about using the bytes | ||||
# API when os.getcwdb() is called. | # API when os.getcwdb() is called. | ||||
# | # | ||||
# Additionally, py3.8+ uppercases the drive letter when calling | # Additionally, py3.8+ uppercases the drive letter when calling | ||||
# os.path.realpath(), which is used on ``repo.root``. Since those | # os.path.realpath(), which is used on ``repo.root``. Since those | ||||
# strings are compared in various places as simple strings, also call | # strings are compared in various places as simple strings, also call | ||||
# realpath here. See https://bugs.python.org/issue40368 | # realpath here. See https://bugs.python.org/issue40368 | ||||
# | # | ||||
# However this is not reliable, so lets explicitly make this drive | # However this is not reliable, so lets explicitly make this drive | ||||
# letter upper case. | # letter upper case. | ||||
# | # | ||||
# note: we should consider dropping realpath here since it seems to | # note: we should consider dropping realpath here since it seems to | ||||
# change the semantic of `getcwd`. | # change the semantic of `getcwd`. | ||||
def getcwd(): | def getcwd(): | ||||
cwd = os.getcwd() # re-exports | cwd = os.getcwd() # re-exports | ||||
cwd = os.path.realpath(cwd) | cwd = os.path.realpath(cwd) | ||||
cwd = strtolocal(cwd) | cwd = strtolocal(cwd) | ||||
if DRIVE_RE.match(cwd): | if DRIVE_RE.match(cwd): | ||||
cwd = cwd[0:1].upper() + cwd[1:] | cwd = cwd[0:1].upper() + cwd[1:] | ||||
return cwd | return cwd | ||||
else: | else: | ||||
getcwd = os.getcwdb # re-exports | getcwd = os.getcwdb # re-exports | ||||
else: | |||||
getcwd = os.getcwd # re-exports | |||||
# How to treat ambiguous-width characters. Set to 'wide' to treat as wide. | # How to treat ambiguous-width characters. Set to 'wide' to treat as wide. | ||||
_wide = _sysstr( | _wide = _sysstr( | ||||
environ.get(b"HGENCODINGAMBIGUOUS", b"narrow") == b"wide" | environ.get(b"HGENCODINGAMBIGUOUS", b"narrow") == b"wide" | ||||
and b"WFA" | and b"WFA" | ||||
or b"WF" | or b"WF" | ||||
) | ) | ||||
return _jsonescapeu8fast(u8chars, paranoid) | return _jsonescapeu8fast(u8chars, paranoid) | ||||
except ValueError: | except ValueError: | ||||
pass | pass | ||||
return charencodepure.jsonescapeu8fallback(u8chars, paranoid) | return charencodepure.jsonescapeu8fallback(u8chars, paranoid) | ||||
# We need to decode/encode U+DCxx codes transparently since invalid UTF-8 | # We need to decode/encode U+DCxx codes transparently since invalid UTF-8 | ||||
# bytes are mapped to that range. | # bytes are mapped to that range. | ||||
if pycompat.ispy3: | |||||
_utf8strict = r'surrogatepass' | _utf8strict = r'surrogatepass' | ||||
else: | |||||
_utf8strict = r'strict' | |||||
_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4] | _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4] | ||||
def getutf8char(s, pos): | def getutf8char(s, pos): | ||||
# type: (bytes, int) -> bytes | # type: (bytes, int) -> bytes | ||||
"""get the next full utf-8 character in the given string, starting at pos | """get the next full utf-8 character in the given string, starting at pos | ||||