diff --git a/i18n/check-translation.py b/i18n/check-translation.py --- a/i18n/check-translation.py +++ b/i18n/check-translation.py @@ -224,14 +224,6 @@ failures, tests = doctest.testmod() sys.exit(failures and 1 or 0) - # replace polib._POFileParser to show linenum of problematic msgstr - class ExtPOFileParser(polib._POFileParser): - def process(self, symbol, linenum): - super(ExtPOFileParser, self).process(symbol, linenum) - if symbol == 'MS': # msgstr - self.current_entry.linenum = linenum - polib._POFileParser = ExtPOFileParser - detected = [] warning = options.warning for f in args: diff --git a/i18n/polib.py b/i18n/polib.py --- a/i18n/polib.py +++ b/i18n/polib.py @@ -1,5 +1,5 @@ -# -*- coding: utf-8 -*- # no-check-code +# -* coding: utf-8 -*- # # License: MIT (see LICENSE file provided) # vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: @@ -15,10 +15,10 @@ from __future__ import absolute_import -__author__ = 'David Jean Louis ' -__version__ = '0.6.4' -__all__ = ['pofile', 'POFile', 'POEntry', 'mofile', 'MOFile', 'MOEntry', - 'detect_encoding', 'escape', 'unescape', 'detect_encoding',] +__author__ = 'David Jean Louis ' +__version__ = '1.0.7' +__all__ = ['pofile', 'POFile', 'POEntry', 'mofile', 'MOFile', 'MOEntry', + 'default_encoding', 'escape', 'unescape', 'detect_encoding', ] import array import codecs @@ -27,14 +27,47 @@ import struct import sys import textwrap -import types + +try: + import io +except ImportError: + # replacement of io.open() for python < 2.6 + # we use codecs instead + class io(object): + @staticmethod + def open(fpath, mode='r', encoding=None): + return codecs.open(fpath, mode, encoding) # the default encoding to use when encoding cannot be detected default_encoding = 'utf-8' +# python 2/3 compatibility helpers {{{ + + +if sys.version_info[:2] < (3, 0): + PY3 = False + text_type = unicode + + def b(s): + return s + + def u(s): + return unicode(s, "unicode_escape") + +else: + PY3 = True + text_type = str + + def b(s): + return s.encode("latin-1") + + def u(s): + return s +# }}} # _pofile_or_mofile {{{ + def _pofile_or_mofile(f, type, **kwargs): """ Internal function used by :func:`polib.pofile` and :func:`polib.mofile` to @@ -50,15 +83,34 @@ parser = kls( f, encoding=enc, - check_for_duplicates=kwargs.get('check_for_duplicates', False) + check_for_duplicates=kwargs.get('check_for_duplicates', False), + klass=kwargs.get('klass') ) instance = parser.parse() instance.wrapwidth = kwargs.get('wrapwidth', 78) return instance +# }}} +# _is_file {{{ + +def _is_file(filename_or_contents): + """ + Safely returns the value of os.path.exists(filename_or_contents). + + Arguments: + + ``filename_or_contents`` + either a filename, or a string holding the contents of some file. + In the latter case, this function will always return False. + """ + try: + return os.path.exists(filename_or_contents) + except (ValueError, UnicodeEncodeError): + return False # }}} # function pofile() {{{ + def pofile(pofile, **kwargs): """ Convenience function that parses the po or pot file ``pofile`` and returns @@ -80,12 +132,17 @@ ``check_for_duplicates`` whether to check for duplicate entries when adding entries to the file (optional, default: ``False``). + + ``klass`` + class which is used to instantiate the return value (optional, + default: ``None``, the return value with be a :class:`~polib.POFile` + instance). 
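The ExtPOFileParser monkeypatch in check-translation.py becomes unnecessary because polib 1.0.7 records the line number of each entry itself (see the ``linenum`` handling added to POEntry and _POFileParser further down), and the new ``klass`` keyword lets callers substitute their own POFile subclass without touching the parser. A rough usage sketch, assuming polib >= 1.0.7 as imported here; the catalog path and subclass name are made up:

import polib

class CheckedPOFile(polib.POFile):
    """Hypothetical subclass; nothing extra is needed to get line numbers."""

po = polib.pofile('i18n/example.po', klass=CheckedPOFile)
for entry in po.translated_entries():
    # each parsed entry now carries the line at which it starts
    print(entry.linenum, entry.msgid)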
""" return _pofile_or_mofile(pofile, 'pofile', **kwargs) - # }}} # function mofile() {{{ + def mofile(mofile, **kwargs): """ Convenience function that parses the mo file ``mofile`` and returns a @@ -108,12 +165,17 @@ ``check_for_duplicates`` whether to check for duplicate entries when adding entries to the file (optional, default: ``False``). + + ``klass`` + class which is used to instantiate the return value (optional, + default: ``None``, the return value with be a :class:`~polib.POFile` + instance). """ return _pofile_or_mofile(mofile, 'mofile', **kwargs) - # }}} # function detect_encoding() {{{ + def detect_encoding(file, binary_mode=False): """ Try to detect the encoding used by the ``file``. The ``file`` argument can @@ -129,7 +191,9 @@ ``binary_mode`` boolean, set this to True if ``file`` is a mo file. """ - rx = re.compile(r'"?Content-Type:.+? charset=([\w_\-:\.]+)') + PATTERN = r'"?Content-Type:.+? charset=([\w_\-:\.]+)' + rxt = re.compile(u(PATTERN)) + rxb = re.compile(b(PATTERN)) def charset_exists(charset): """Check whether ``charset`` is valid or not.""" @@ -139,31 +203,36 @@ return False return True - if not os.path.exists(file): - match = rx.search(file) + if not _is_file(file): + match = rxt.search(file) if match: enc = match.group(1).strip() if charset_exists(enc): return enc else: - if binary_mode: + # For PY3, always treat as binary + if binary_mode or PY3: mode = 'rb' + rx = rxb else: mode = 'r' + rx = rxt f = open(file, mode) for l in f.readlines(): match = rx.search(l) if match: f.close() enc = match.group(1).strip() + if not isinstance(enc, text_type): + enc = enc.decode('utf-8') if charset_exists(enc): return enc f.close() return default_encoding - # }}} # function escape() {{{ + def escape(st): """ Escapes the characters ``\\\\``, ``\\t``, ``\\n``, ``\\r`` and ``"`` in @@ -174,10 +243,10 @@ .replace('\r', r'\r')\ .replace('\n', r'\n')\ .replace('\"', r'\"') - # }}} # function unescape() {{{ + def unescape(st): """ Unescapes the characters ``\\\\``, ``\\t``, ``\\n``, ``\\r`` and ``"`` in @@ -193,12 +262,12 @@ return '\r' if m == '\\': return '\\' - return m # handles escaped double quote + return m # handles escaped double quote return re.sub(r'\\(\\|n|t|r|")', unescape_repl, st) - # }}} # class _BaseFile {{{ + class _BaseFile(list): """ Common base class for the :class:`~polib.POFile` and :class:`~polib.MOFile` @@ -227,7 +296,7 @@ list.__init__(self) # the opened file handle pofile = kwargs.get('pofile', None) - if pofile and os.path.exists(pofile): + if pofile and _is_file(pofile): self.fpath = pofile else: self.fpath = kwargs.get('fpath') @@ -254,38 +323,45 @@ ret.append(entry.__unicode__(self.wrapwidth)) for entry in self.obsolete_entries(): ret.append(entry.__unicode__(self.wrapwidth)) - ret = '\n'.join(ret) + ret = u('\n').join(ret) - if type(ret) != types.UnicodeType: - return unicode(ret, self.encoding) + assert isinstance(ret, text_type) + #if type(ret) != text_type: + # return unicode(ret, self.encoding) return ret - def __str__(self): - """ - Returns the string representation of the file. - """ - return unicode(self).encode(self.encoding) + if PY3: + def __str__(self): + return self.__unicode__() + else: + def __str__(self): + """ + Returns the string representation of the file. + """ + return unicode(self).encode(self.encoding) def __contains__(self, entry): """ - Overriden ``list`` method to implement the membership test (in and + Overridden ``list`` method to implement the membership test (in and not in). 
The method considers that an entry is in the file if it finds an entry - that has the same msgid (the test is **case sensitive**). + that has the same msgid (the test is **case sensitive**) and the same + msgctxt (or none for both entries). Argument: ``entry`` an instance of :class:`~polib._BaseEntry`. """ - return self.find(entry.msgid, by='msgid') is not None + return self.find(entry.msgid, by='msgid', msgctxt=entry.msgctxt) \ + is not None def __eq__(self, other): - return unicode(self) == unicode(other) + return str(self) == str(other) def append(self, entry): """ - Overriden method to check for duplicates entries, if a user tries to + Overridden method to check for duplicates entries, if a user tries to add an entry that is already in the file, the method will raise a ``ValueError`` exception. @@ -300,7 +376,7 @@ def insert(self, index, entry): """ - Overriden method to check for duplicates entries, if a user tries to + Overridden method to check for duplicates entries, if a user tries to add an entry that is already in the file, the method will raise a ``ValueError`` exception. @@ -332,7 +408,7 @@ e.flags.append('fuzzy') return e - def save(self, fpath=None, repr_method='__str__'): + def save(self, fpath=None, repr_method='__unicode__'): """ Saves the po file to ``fpath``. If it is an existing file and no ``fpath`` is provided, then the @@ -354,8 +430,8 @@ if repr_method == 'to_binary': fhandle = open(fpath, 'wb') else: - fhandle = codecs.open(fpath, 'w', self.encoding) - if type(contents) != types.UnicodeType: + fhandle = io.open(fpath, 'w', encoding=self.encoding) + if not isinstance(contents, text_type): contents = contents.decode(self.encoding) fhandle.write(contents) fhandle.close() @@ -381,7 +457,7 @@ boolean, whether to also search in entries that are obsolete. ``msgctxt`` - string, allows to specify a specific message context for the + string, allows specifying a specific message context for the search. 
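With the msgctxt-aware ``__contains__`` and ``find()``, two entries that share a msgid but carry different contexts no longer shadow each other. A small illustration, assuming polib >= 1.0 as patched here:

import polib

po = polib.POFile()
po.append(polib.POEntry(msgid='Open', msgctxt='menu', msgstr='Ouvrir'))

assert polib.POEntry(msgid='Open', msgctxt='menu') in po
assert polib.POEntry(msgid='Open', msgctxt='toolbar') not in po   # context differs
assert po.find('Open', msgctxt='toolbar') is None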
""" if include_obsolete_entries: @@ -390,7 +466,7 @@ entries = [e for e in self if not e.obsolete] for e in entries: if getattr(e, by) == st: - if msgctxt and e.msgctxt != msgctxt: + if msgctxt is not False and e.msgctxt != msgctxt: continue return e return None @@ -412,7 +488,9 @@ 'Language-Team', 'MIME-Version', 'Content-Type', - 'Content-Transfer-Encoding' + 'Content-Transfer-Encoding', + 'Language', + 'Plural-Forms' ] ordered_data = [] for data in data_order: @@ -423,9 +501,7 @@ pass # the rest of the metadata will be alphabetically ordered since there # are no specs for this AFAIK - keys = metadata.keys() - keys.sort() - for data in keys: + for data in sorted(metadata.keys()): value = metadata[data] ordered_data.append((data, value)) return ordered_data @@ -436,18 +512,12 @@ """ offsets = [] entries = self.translated_entries() + # the keys are sorted in the .mo file def cmp(_self, other): # msgfmt compares entries with msgctxt if it exists - if _self.msgctxt: - self_msgid = _self.msgctxt - else: - self_msgid = _self.msgid - - if other.msgctxt: - other_msgid = other.msgctxt - else: - other_msgid = other.msgid + self_msgid = _self.msgctxt and _self.msgctxt or _self.msgid + other_msgid = other.msgctxt and other.msgctxt or other.msgid if self_msgid > other_msgid: return 1 elif self_msgid < other_msgid: @@ -455,25 +525,23 @@ else: return 0 # add metadata entry - entries.sort(cmp) + entries.sort(key=lambda o: o.msgctxt or o.msgid) mentry = self.metadata_as_entry() #mentry.msgstr = mentry.msgstr.replace('\\n', '').lstrip() entries = [mentry] + entries entries_len = len(entries) - ids, strs = '', '' + ids, strs = b(''), b('') for e in entries: # For each string, we need size and file offset. Each string is # NUL terminated; the NUL does not count into the size. - msgid = '' + msgid = b('') if e.msgctxt: # Contexts are stored by storing the concatenation of the # context, a byte, and the original string msgid = self._encode(e.msgctxt + '\4') if e.msgid_plural: - indexes = e.msgstr_plural.keys() - indexes.sort() msgstr = [] - for index in indexes: + for index in sorted(e.msgstr_plural.keys()): msgstr.append(e.msgstr_plural[index]) msgid += self._encode(e.msgid + '\0' + e.msgid_plural) msgstr = self._encode('\0'.join(msgstr)) @@ -481,11 +549,11 @@ msgid += self._encode(e.msgid) msgstr = self._encode(e.msgstr) offsets.append((len(ids), len(msgid), len(strs), len(msgstr))) - ids += msgid + '\0' - strs += msgstr + '\0' + ids += msgid + b('\0') + strs += msgstr + b('\0') # The header is 7 32-bit unsigned integers. - keystart = 7*4+16*entries_len + keystart = 7 * 4 + 16 * entries_len # and the values start after the keys valuestart = keystart + len(ids) koffsets = [] @@ -493,26 +561,30 @@ # The string table first has the list of keys, then the list of values. # Each entry has first the size of the string, then the file offset. for o1, l1, o2, l2 in offsets: - koffsets += [l1, o1+keystart] - voffsets += [l2, o2+valuestart] + koffsets += [l1, o1 + keystart] + voffsets += [l2, o2 + valuestart] offsets = koffsets + voffsets - # check endianness for magic number - if struct.pack('@h', 1) == struct.pack(' 1: # python 3.2 or superior + output += array.array("i", offsets).tobytes() + else: + output += array.array("i", offsets).tostring() output += ids output += strs return output @@ -522,13 +594,13 @@ Encodes the given ``mixed`` argument with the file encoding if and only if it's an unicode string and returns the encoded string. 
""" - if type(mixed) == types.UnicodeType: - return mixed.encode(self.encoding) + if isinstance(mixed, text_type): + mixed = mixed.encode(self.encoding) return mixed - # }}} # class POFile {{{ + class POFile(_BaseFile): """ Po (or Pot) file reader/writer. @@ -542,13 +614,15 @@ """ ret, headers = '', self.header.split('\n') for header in headers: - if header[:1] in [',', ':']: + if not len(header): + ret += "#\n" + elif header[:1] in [',', ':']: ret += '#%s\n' % header else: ret += '# %s\n' % header - if type(ret) != types.UnicodeType: - ret = unicode(ret, self.encoding) + if not isinstance(ret, text_type): + ret = ret.decode(self.encoding) return ret + _BaseFile.__unicode__(self) @@ -572,7 +646,7 @@ if total == 0: return 100 translated = len(self.translated_entries()) - return int((100.00 / float(total)) * translated) + return int(translated * 100 / float(total)) def translated_entries(self): """ @@ -584,7 +658,7 @@ """ Convenience method that returns the list of untranslated entries. """ - return [e for e in self if not e.translated() and not e.obsolete \ + return [e for e in self if not e.translated() and not e.obsolete and not 'fuzzy' in e.flags] def fuzzy_entries(self): @@ -615,28 +689,32 @@ ``refpot`` object POFile, the reference catalog. """ + # Store entries in dict/set for faster access + self_entries = dict((entry.msgid, entry) for entry in self) + refpot_msgids = set(entry.msgid for entry in refpot) + # Merge entries that are in the refpot for entry in refpot: - e = self.find(entry.msgid, include_obsolete_entries=True) + e = self_entries.get(entry.msgid) if e is None: e = POEntry() self.append(e) e.merge(entry) # ok, now we must "obsolete" entries that are not in the refpot anymore for entry in self: - if refpot.find(entry.msgid) is None: + if entry.msgid not in refpot_msgids: entry.obsolete = True - # }}} # class MOFile {{{ + class MOFile(_BaseFile): """ Mo file reader/writer. This class inherits the :class:`~polib._BaseFile` class and, by extension, the python ``list`` type. """ - BIG_ENDIAN = 0xde120495 - LITTLE_ENDIAN = 0x950412de + MAGIC = 0x950412de + MAGIC_SWAPPED = 0xde120495 def __init__(self, *args, **kwargs): """ @@ -698,10 +776,10 @@ Convenience method to keep the same interface with POFile instances. """ return [] - # }}} # class _BaseEntry {{{ + class _BaseEntry(object): """ Base class for :class:`~polib.POEntry` and :class:`~polib.MOEntry` classes. 
@@ -753,12 +831,14 @@ ret = [] # write the msgctxt if any if self.msgctxt is not None: - ret += self._str_field("msgctxt", delflag, "", self.msgctxt, wrapwidth) + ret += self._str_field("msgctxt", delflag, "", self.msgctxt, + wrapwidth) # write the msgid ret += self._str_field("msgid", delflag, "", self.msgid, wrapwidth) # write the msgid_plural if any if self.msgid_plural: - ret += self._str_field("msgid_plural", delflag, "", self.msgid_plural, wrapwidth) + ret += self._str_field("msgid_plural", delflag, "", + self.msgid_plural, wrapwidth) if self.msgstr_plural: # write the msgstr_plural if any msgstrs = self.msgstr_plural @@ -767,30 +847,34 @@ for index in keys: msgstr = msgstrs[index] plural_index = '[%s]' % index - ret += self._str_field("msgstr", delflag, plural_index, msgstr, wrapwidth) + ret += self._str_field("msgstr", delflag, plural_index, msgstr, + wrapwidth) else: # otherwise write the msgstr - ret += self._str_field("msgstr", delflag, "", self.msgstr, wrapwidth) + ret += self._str_field("msgstr", delflag, "", self.msgstr, + wrapwidth) ret.append('') - ret = '\n'.join(ret) - - if type(ret) != types.UnicodeType: - return unicode(ret, self.encoding) + ret = u('\n').join(ret) return ret - def __str__(self): - """ - Returns the string representation of the entry. - """ - return unicode(self).encode(self.encoding) + if PY3: + def __str__(self): + return self.__unicode__() + else: + def __str__(self): + """ + Returns the string representation of the entry. + """ + return unicode(self).encode(self.encoding) def __eq__(self, other): - return unicode(self) == unicode(other) + return str(self) == str(other) - def _str_field(self, fieldname, delflag, plural_index, field, wrapwidth=78): + def _str_field(self, fieldname, delflag, plural_index, field, + wrapwidth=78): lines = field.splitlines(True) if len(lines) > 1: - lines = [''] + lines # start with initial empty line + lines = [''] + lines # start with initial empty line else: escaped_field = escape(field) specialchars_count = 0 @@ -804,9 +888,9 @@ real_wrapwidth = wrapwidth - flength + specialchars_count if wrapwidth > 0 and len(field) > real_wrapwidth: # Wrap the line but take field name into account - lines = [''] + [unescape(item) for item in textwrap.wrap( + lines = [''] + [unescape(item) for item in wrap( escaped_field, - wrapwidth - 2, # 2 for quotes "" + wrapwidth - 2, # 2 for quotes "" drop_whitespace=False, break_long_words=False )] @@ -818,13 +902,13 @@ ret = ['%s%s%s "%s"' % (delflag, fieldname, plural_index, escape(lines.pop(0)))] - for mstr in lines: - ret.append('%s"%s"' % (delflag, escape(mstr))) + for line in lines: + ret.append('%s"%s"' % (delflag, escape(line))) return ret - # }}} # class POEntry {{{ + class POEntry(_BaseEntry): """ Represents a po file entry. @@ -854,6 +938,9 @@ ``previous_msgid_plural`` string, the entry previous msgid_plural. 
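_str_field() and the comment/occurrence writers below now go through the module-level wrap() helper so that drop_whitespace can be disabled even on Python 2.5 (textwrap only grew that flag in 2.6). What the call amounts to on a modern interpreter, with a made-up msgid and wrap width:

import textwrap

escaped_field = 'A fairly long msgid that would overflow the configured wrap width'
lines = textwrap.wrap(escaped_field, 30 - 2,   # 2 columns reserved for the quotes
                      drop_whitespace=False,   # keep significant whitespace
                      break_long_words=False)
for chunk in [''] + lines:                     # leading empty line, as polib emits it
    print('"%s"' % chunk)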
+ + ``linenum`` + integer, the line number of the entry """ _BaseEntry.__init__(self, *args, **kwargs) self.comment = kwargs.get('comment', '') @@ -863,6 +950,7 @@ self.previous_msgctxt = kwargs.get('previous_msgctxt', None) self.previous_msgid = kwargs.get('previous_msgid', None) self.previous_msgid_plural = kwargs.get('previous_msgid_plural', None) + self.linenum = kwargs.get('linenum', None) def __unicode__(self, wrapwidth=78): """ @@ -879,7 +967,7 @@ if val: for comment in val.split('\n'): if wrapwidth > 0 and len(comment) + len(c[1]) > wrapwidth: - ret += textwrap.wrap( + ret += wrap( comment, wrapwidth, initial_indent=c[1], @@ -903,7 +991,7 @@ # what we want for filenames, so the dirty hack is to # temporally replace hyphens with a char that a file cannot # contain, like "*" - ret += [l.replace('*', '-') for l in textwrap.wrap( + ret += [l.replace('*', '-') for l in wrap( filestr.replace('-', '*'), wrapwidth, initial_indent='#: ', @@ -918,32 +1006,25 @@ ret.append('#, %s' % ', '.join(self.flags)) # previous context and previous msgid/msgid_plural - fields = ['previous_msgctxt', 'previous_msgid', 'previous_msgid_plural'] + fields = ['previous_msgctxt', 'previous_msgid', + 'previous_msgid_plural'] for f in fields: val = getattr(self, f) if val: ret += self._str_field(f, "#| ", "", val, wrapwidth) ret.append(_BaseEntry.__unicode__(self, wrapwidth)) - ret = '\n'.join(ret) + ret = u('\n').join(ret) - if type(ret) != types.UnicodeType: - return unicode(ret, self.encoding) + assert isinstance(ret, text_type) + #if type(ret) != types.UnicodeType: + # return unicode(ret, self.encoding) return ret def __cmp__(self, other): """ Called by comparison operations if rich comparison is not defined. """ - def compare_occurrences(a, b): - """ - Compare an entry occurrence with another one. - """ - if a[0] != b[0]: - return a[0] < b[0] - if a[1] != b[1]: - return a[1] < b[1] - return 0 # First: Obsolete test if self.obsolete != other.obsolete: @@ -952,12 +1033,8 @@ else: return 1 # Work on a copy to protect original - occ1 = self.occurrences[:] - occ2 = other.occurrences[:] - # Sorting using compare method - occ1.sort(compare_occurrences) - occ2.sort(compare_occurrences) - # Comparing sorted occurrences + occ1 = sorted(self.occurrences[:]) + occ2 = sorted(other.occurrences[:]) pos = 0 for entry1 in occ1: try: @@ -975,9 +1052,41 @@ return 1 else: return -1 + # Compare msgid_plural if set + if self.msgid_plural: + if not other.msgid_plural: + return 1 + for pos in self.msgid_plural: + if pos not in other.msgid_plural: + return 1 + if self.msgid_plural[pos] > other.msgid_plural[pos]: + return 1 + if self.msgid_plural[pos] < other.msgid_plural[pos]: + return -1 # Finally: Compare message ID - if self.msgid > other.msgid: return 1 - else: return -1 + if self.msgid > other.msgid: + return 1 + elif self.msgid < other.msgid: + return -1 + return 0 + + def __gt__(self, other): + return self.__cmp__(other) > 0 + + def __lt__(self, other): + return self.__cmp__(other) < 0 + + def __ge__(self, other): + return self.__cmp__(other) >= 0 + + def __le__(self, other): + return self.__cmp__(other) <= 0 + + def __eq__(self, other): + return self.__cmp__(other) == 0 + + def __ne__(self, other): + return self.__cmp__(other) != 0 def translated(self): """ @@ -1020,18 +1129,49 @@ except KeyError: self.msgstr_plural[pos] = '' + def __hash__(self): + return hash((self.msgid, self.msgstr)) # }}} # class MOEntry {{{ + class MOEntry(_BaseEntry): """ Represents a mo file entry. 
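Because Python 3 ignores ``__cmp__``, the block above layers the full set of rich comparison operators (plus ``__hash__``) on top of it, so sorting and de-duplicating POEntry objects keeps working. A quick check, assuming polib >= 1.0 as patched here:

import polib

first = polib.POEntry(msgid='apple')
second = polib.POEntry(msgid='banana')
assert first < second                                            # __lt__ delegates to __cmp__
assert sorted([second, first])[0].msgid == 'apple'
assert len({first, second, polib.POEntry(msgid='apple')}) == 2   # __hash__ + __eq__ de-duplicate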
""" - pass + def __init__(self, *args, **kwargs): + """ + Constructor, accepts the following keyword arguments, + for consistency with :class:`~polib.POEntry`: + + ``comment`` + ``tcomment`` + ``occurrences`` + ``flags`` + ``previous_msgctxt`` + ``previous_msgid`` + ``previous_msgid_plural`` + + Note: even though these keyword arguments are accepted, + they hold no real meaning in the context of MO files + and are simply ignored. + """ + _BaseEntry.__init__(self, *args, **kwargs) + self.comment = '' + self.tcomment = '' + self.occurrences = [] + self.flags = [] + self.previous_msgctxt = None + self.previous_msgid = None + self.previous_msgid_plural = None + + def __hash__(self): + return hash((self.msgid, self.msgstr)) # }}} # class _POFileParser {{{ + class _POFileParser(object): """ A finite state machine to parse efficiently and correctly po @@ -1056,23 +1196,27 @@ file (optional, default: ``False``). """ enc = kwargs.get('encoding', default_encoding) - if os.path.exists(pofile): + if _is_file(pofile): try: - self.fhandle = codecs.open(pofile, 'rU', enc) + self.fhandle = io.open(pofile, 'rt', encoding=enc) except LookupError: enc = default_encoding - self.fhandle = codecs.open(pofile, 'rU', enc) + self.fhandle = io.open(pofile, 'rt', encoding=enc) else: self.fhandle = pofile.splitlines() - self.instance = POFile( + klass = kwargs.get('klass') + if klass is None: + klass = POFile + self.instance = klass( pofile=pofile, encoding=enc, check_for_duplicates=kwargs.get('check_for_duplicates', False) ) self.transitions = {} - self.current_entry = POEntry() - self.current_state = 'ST' + self.current_line = 0 + self.current_entry = POEntry(linenum=self.current_line) + self.current_state = 'st' self.current_token = None # two memo flags used in handlers self.msgstr_index = 0 @@ -1083,7 +1227,7 @@ # * HE: Header # * TC: a translation comment # * GC: a generated comment - # * OC: a file/line occurence + # * OC: a file/line occurrence # * FL: a flags line # * CT: a message context # * PC: a previous msgctxt @@ -1094,48 +1238,47 @@ # * MS: a msgstr # * MX: a msgstr plural # * MC: a msgid or msgstr continuation line - all = ['ST', 'HE', 'GC', 'OC', 'FL', 'CT', 'PC', 'PM', 'PP', 'TC', - 'MS', 'MP', 'MX', 'MI'] + all = ['st', 'he', 'gc', 'oc', 'fl', 'ct', 'pc', 'pm', 'pp', 'tc', + 'ms', 'mp', 'mx', 'mi'] - self.add('TC', ['ST', 'HE'], 'HE') - self.add('TC', ['GC', 'OC', 'FL', 'TC', 'PC', 'PM', 'PP', 'MS', - 'MP', 'MX', 'MI'], 'TC') - self.add('GC', all, 'GC') - self.add('OC', all, 'OC') - self.add('FL', all, 'FL') - self.add('PC', all, 'PC') - self.add('PM', all, 'PM') - self.add('PP', all, 'PP') - self.add('CT', ['ST', 'HE', 'GC', 'OC', 'FL', 'TC', 'PC', 'PM', - 'PP', 'MS', 'MX'], 'CT') - self.add('MI', ['ST', 'HE', 'GC', 'OC', 'FL', 'CT', 'TC', 'PC', - 'PM', 'PP', 'MS', 'MX'], 'MI') - self.add('MP', ['TC', 'GC', 'PC', 'PM', 'PP', 'MI'], 'MP') - self.add('MS', ['MI', 'MP', 'TC'], 'MS') - self.add('MX', ['MI', 'MX', 'MP', 'TC'], 'MX') - self.add('MC', ['CT', 'MI', 'MP', 'MS', 'MX', 'PM', 'PP', 'PC'], 'MC') + self.add('tc', ['st', 'he'], 'he') + self.add('tc', ['gc', 'oc', 'fl', 'tc', 'pc', 'pm', 'pp', 'ms', + 'mp', 'mx', 'mi'], 'tc') + self.add('gc', all, 'gc') + self.add('oc', all, 'oc') + self.add('fl', all, 'fl') + self.add('pc', all, 'pc') + self.add('pm', all, 'pm') + self.add('pp', all, 'pp') + self.add('ct', ['st', 'he', 'gc', 'oc', 'fl', 'tc', 'pc', 'pm', + 'pp', 'ms', 'mx'], 'ct') + self.add('mi', ['st', 'he', 'gc', 'oc', 'fl', 'ct', 'tc', 'pc', + 'pm', 'pp', 'ms', 'mx'], 'mi') + self.add('mp', 
['tc', 'gc', 'pc', 'pm', 'pp', 'mi'], 'mp') + self.add('ms', ['mi', 'mp', 'tc'], 'ms') + self.add('mx', ['mi', 'mx', 'mp', 'tc'], 'mx') + self.add('mc', ['ct', 'mi', 'mp', 'ms', 'mx', 'pm', 'pp', 'pc'], 'mc') def parse(self): """ Run the state machine, parse the file line by line and call process() with the current matched symbol. """ - i = 0 keywords = { - 'msgctxt': 'CT', - 'msgid': 'MI', - 'msgstr': 'MS', - 'msgid_plural': 'MP', + 'msgctxt': 'ct', + 'msgid': 'mi', + 'msgstr': 'ms', + 'msgid_plural': 'mp', } prev_keywords = { - 'msgid_plural': 'PP', - 'msgid': 'PM', - 'msgctxt': 'PC', + 'msgid_plural': 'pp', + 'msgid': 'pm', + 'msgctxt': 'pc', } - + tokens = [] for line in self.fhandle: - i += 1 + self.current_line += 1 line = line.strip() if line == '': continue @@ -1143,6 +1286,9 @@ tokens = line.split(None, 2) nb_tokens = len(tokens) + if tokens[0] == '#~|': + continue + if tokens[0] == '#~' and nb_tokens > 1: line = line[3:].strip() tokens = tokens[1:] @@ -1155,41 +1301,56 @@ # msgid, msgid_plural, msgctxt & msgstr. if tokens[0] in keywords and nb_tokens > 1: line = line[len(tokens[0]):].lstrip() + if re.search(r'([^\\]|^)"', line[1:-1]): + raise IOError('Syntax error in po file %s (line %s): ' + 'unescaped double quote found' % + (self.instance.fpath, self.current_line)) self.current_token = line - self.process(keywords[tokens[0]], i) + self.process(keywords[tokens[0]]) continue self.current_token = line - if tokens[0] == '#:' and nb_tokens > 1: + if tokens[0] == '#:': + if nb_tokens <= 1: + continue # we are on a occurrences line - self.process('OC', i) + self.process('oc') elif line[:1] == '"': # we are on a continuation line - self.process('MC', i) + if re.search(r'([^\\]|^)"', line[1:-1]): + raise IOError('Syntax error in po file %s (line %s): ' + 'unescaped double quote found' % + (self.instance.fpath, self.current_line)) + self.process('mc') elif line[:7] == 'msgstr[': # we are on a msgstr plural - self.process('MX', i) + self.process('mx') - elif tokens[0] == '#,' and nb_tokens > 1: + elif tokens[0] == '#,': + if nb_tokens <= 1: + continue # we are on a flags line - self.process('FL', i) + self.process('fl') - elif tokens[0] == '#': - if line == '#': line += ' ' + elif tokens[0] == '#' or tokens[0].startswith('##'): + if line == '#': + line += ' ' # we are on a translator comment line - self.process('TC', i) + self.process('tc') - elif tokens[0] == '#.' and nb_tokens > 1: + elif tokens[0] == '#.': + if nb_tokens <= 1: + continue # we are on a generated comment line - self.process('GC', i) + self.process('gc') elif tokens[0] == '#|': - if nb_tokens < 2: - self.process('??', i) - continue + if nb_tokens <= 1: + raise IOError('Syntax error in po file %s (line %s)' % + (self.instance.fpath, self.current_line)) # Remove the marker and any whitespace right after that. line = line[2:].lstrip() @@ -1197,48 +1358,57 @@ if tokens[1].startswith('"'): # Continuation of previous metadata. - self.process('MC', i) + self.process('mc') continue if nb_tokens == 2: # Invalid continuation line. - self.process('??', i) + raise IOError('Syntax error in po file %s (line %s): ' + 'invalid continuation line' % + (self.instance.fpath, self.current_line)) # we are on a "previous translation" comment line, if tokens[1] not in prev_keywords: # Unknown keyword in previous translation comment. 
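The parser now fails loudly on malformed input instead of silently feeding a '??' symbol to the state machine; in particular, an unescaped double quote inside the quoted part of a msgid/msgstr line raises IOError with the offending line number. The check itself, extracted into a stand-alone form (the sample lines are made up):

import re

def has_unescaped_quote(quoted_line):
    # look inside the surrounding quotes for a '"' not preceded by a backslash
    return re.search(r'([^\\]|^)"', quoted_line[1:-1]) is not None

assert not has_unescaped_quote('"hello \\"world\\""')   # properly escaped, accepted
assert has_unescaped_quote('"hello "world""')           # would raise IOError in parse()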
- self.process('??', i) + raise IOError('Syntax error in po file %s (line %s): ' + 'unknown keyword %s' % + (self.instance.fpath, self.current_line, + tokens[1])) # Remove the keyword and any whitespace # between it and the starting quote. line = line[len(tokens[1]):].lstrip() self.current_token = line - self.process(prev_keywords[tokens[1]], i) + self.process(prev_keywords[tokens[1]]) else: - self.process('??', i) + raise IOError('Syntax error in po file %s (line %s)' % + (self.instance.fpath, self.current_line)) - if self.current_entry: + if self.current_entry and len(tokens) > 0 and \ + not tokens[0].startswith('#'): # since entries are added when another entry is found, we must add - # the last entry here (only if there are lines) + # the last entry here (only if there are lines). Trailing comments + # are ignored self.instance.append(self.current_entry) + # before returning the instance, check if there's metadata and if # so extract it in a dict - firstentry = self.instance[0] - if firstentry.msgid == '': # metadata found + metadataentry = self.instance.find('') + if metadataentry: # metadata found # remove the entry - firstentry = self.instance.pop(0) - self.instance.metadata_is_fuzzy = firstentry.flags + self.instance.remove(metadataentry) + self.instance.metadata_is_fuzzy = metadataentry.flags key = None - for msg in firstentry.msgstr.splitlines(): + for msg in metadataentry.msgstr.splitlines(): try: key, val = msg.split(':', 1) self.instance.metadata[key] = val.strip() - except: + except (ValueError, KeyError): if key is not None: - self.instance.metadata[key] += '\n'+ msg.strip() + self.instance.metadata[key] += '\n' + msg.strip() # close opened file - if isinstance(self.fhandle, file): + if not isinstance(self.fhandle, list): # must be file self.fhandle.close() return self.instance @@ -1258,10 +1428,10 @@ the next state the fsm will have after the action. """ for state in states: - action = getattr(self, 'handle_%s' % next_state.lower()) + action = getattr(self, 'handle_%s' % next_state) self.transitions[(symbol, state)] = (action, next_state) - def process(self, symbol, linenum): + def process(self, symbol): """ Process the transition corresponding to the current state and the symbol provided. 
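add() wires one (symbol, state) pair per allowed transition to the handler named after the target state, and process() simply looks the pair up, so supporting a new line type means registering transitions plus one handle_xx() method. The dispatch pattern in miniature (class and handler names here are generic, not part of polib):

class TinyFSM(object):
    def __init__(self):
        self.current_state = 'st'
        self.transitions = {}

    def add(self, symbol, states, next_state):
        handler = getattr(self, 'handle_%s' % next_state)
        for state in states:
            self.transitions[(symbol, state)] = (handler, next_state)

    def process(self, symbol):
        handler, next_state = self.transitions[(symbol, self.current_state)]
        if handler():                      # a True return commits the transition
            self.current_state = next_state

    def handle_mi(self):                   # "saw a msgid line"
        return True

fsm = TinyFSM()
fsm.add('mi', ['st', 'he'], 'mi')
fsm.process('mi')
assert fsm.current_state == 'mi'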
@@ -1278,8 +1448,9 @@ (action, state) = self.transitions[(symbol, self.current_state)] if action(): self.current_state = state - except Exception as exc: - raise IOError('Syntax error in po file (line %s)' % linenum) + except Exception: + raise IOError('Syntax error in po file (line %s)' % + self.current_line) # state handlers @@ -1292,90 +1463,94 @@ def handle_tc(self): """Handle a translator comment.""" - if self.current_state in ['MC', 'MS', 'MX']: + if self.current_state in ['mc', 'ms', 'mx']: self.instance.append(self.current_entry) - self.current_entry = POEntry() + self.current_entry = POEntry(linenum=self.current_line) if self.current_entry.tcomment != '': self.current_entry.tcomment += '\n' - self.current_entry.tcomment += self.current_token[2:] + tcomment = self.current_token.lstrip('#') + if tcomment.startswith(' '): + tcomment = tcomment[1:] + self.current_entry.tcomment += tcomment return True def handle_gc(self): """Handle a generated comment.""" - if self.current_state in ['MC', 'MS', 'MX']: + if self.current_state in ['mc', 'ms', 'mx']: self.instance.append(self.current_entry) - self.current_entry = POEntry() + self.current_entry = POEntry(linenum=self.current_line) if self.current_entry.comment != '': self.current_entry.comment += '\n' self.current_entry.comment += self.current_token[3:] return True def handle_oc(self): - """Handle a file:num occurence.""" - if self.current_state in ['MC', 'MS', 'MX']: + """Handle a file:num occurrence.""" + if self.current_state in ['mc', 'ms', 'mx']: self.instance.append(self.current_entry) - self.current_entry = POEntry() + self.current_entry = POEntry(linenum=self.current_line) occurrences = self.current_token[3:].split() for occurrence in occurrences: if occurrence != '': try: fil, line = occurrence.split(':') if not line.isdigit(): - fil = fil + line + fil = fil + line line = '' self.current_entry.occurrences.append((fil, line)) - except: + except (ValueError, AttributeError): self.current_entry.occurrences.append((occurrence, '')) return True def handle_fl(self): """Handle a flags line.""" - if self.current_state in ['MC', 'MS', 'MX']: + if self.current_state in ['mc', 'ms', 'mx']: self.instance.append(self.current_entry) - self.current_entry = POEntry() - self.current_entry.flags += self.current_token[3:].split(', ') + self.current_entry = POEntry(linenum=self.current_line) + self.current_entry.flags += [c.strip() for c in + self.current_token[3:].split(',')] return True def handle_pp(self): """Handle a previous msgid_plural line.""" - if self.current_state in ['MC', 'MS', 'MX']: + if self.current_state in ['mc', 'ms', 'mx']: self.instance.append(self.current_entry) - self.current_entry = POEntry() + self.current_entry = POEntry(linenum=self.current_line) self.current_entry.previous_msgid_plural = \ unescape(self.current_token[1:-1]) return True def handle_pm(self): """Handle a previous msgid line.""" - if self.current_state in ['MC', 'MS', 'MX']: + if self.current_state in ['mc', 'ms', 'mx']: self.instance.append(self.current_entry) - self.current_entry = POEntry() + self.current_entry = POEntry(linenum=self.current_line) self.current_entry.previous_msgid = \ unescape(self.current_token[1:-1]) return True def handle_pc(self): """Handle a previous msgctxt line.""" - if self.current_state in ['MC', 'MS', 'MX']: + if self.current_state in ['mc', 'ms', 'mx']: self.instance.append(self.current_entry) - self.current_entry = POEntry() + self.current_entry = POEntry(linenum=self.current_line) self.current_entry.previous_msgctxt = \ 
unescape(self.current_token[1:-1]) return True def handle_ct(self): """Handle a msgctxt.""" - if self.current_state in ['MC', 'MS', 'MX']: + if self.current_state in ['mc', 'ms', 'mx']: self.instance.append(self.current_entry) - self.current_entry = POEntry() + self.current_entry = POEntry(linenum=self.current_line) self.current_entry.msgctxt = unescape(self.current_token[1:-1]) return True def handle_mi(self): """Handle a msgid.""" - if self.current_state in ['MC', 'MS', 'MX']: + if self.current_state in ['mc', 'ms', 'mx']: self.instance.append(self.current_entry) - self.current_entry = POEntry() + self.current_entry = POEntry(linenum=self.current_line) self.current_entry.obsolete = self.entry_obsolete self.current_entry.msgid = unescape(self.current_token[1:-1]) return True @@ -1392,47 +1567,37 @@ def handle_mx(self): """Handle a msgstr plural.""" - index, value = self.current_token[7], self.current_token[11:-1] - self.current_entry.msgstr_plural[index] = unescape(value) - self.msgstr_index = index + index = self.current_token[7] + value = self.current_token[self.current_token.find('"') + 1:-1] + self.current_entry.msgstr_plural[int(index)] = unescape(value) + self.msgstr_index = int(index) return True def handle_mc(self): """Handle a msgid or msgstr continuation line.""" token = unescape(self.current_token[1:-1]) - if self.current_state == 'CT': - typ = 'msgctxt' + if self.current_state == 'ct': self.current_entry.msgctxt += token - elif self.current_state == 'MI': - typ = 'msgid' + elif self.current_state == 'mi': self.current_entry.msgid += token - elif self.current_state == 'MP': - typ = 'msgid_plural' + elif self.current_state == 'mp': self.current_entry.msgid_plural += token - elif self.current_state == 'MS': - typ = 'msgstr' + elif self.current_state == 'ms': self.current_entry.msgstr += token - elif self.current_state == 'MX': - typ = 'msgstr[%s]' % self.msgstr_index + elif self.current_state == 'mx': self.current_entry.msgstr_plural[self.msgstr_index] += token - elif self.current_state == 'PP': - typ = 'previous_msgid_plural' - token = token[3:] + elif self.current_state == 'pp': self.current_entry.previous_msgid_plural += token - elif self.current_state == 'PM': - typ = 'previous_msgid' - token = token[3:] + elif self.current_state == 'pm': self.current_entry.previous_msgid += token - elif self.current_state == 'PC': - typ = 'previous_msgctxt' - token = token[3:] + elif self.current_state == 'pc': self.current_entry.previous_msgctxt += token # don't change the current state return False - # }}} # class _MOFileParser {{{ + class _MOFileParser(object): """ A class to parse binary mo files. @@ -1456,12 +1621,24 @@ file (optional, default: ``False``). """ self.fhandle = open(mofile, 'rb') - self.instance = MOFile( + + klass = kwargs.get('klass') + if klass is None: + klass = MOFile + self.instance = klass( fpath=mofile, encoding=kwargs.get('encoding', default_encoding), check_for_duplicates=kwargs.get('check_for_duplicates', False) ) + def __del__(self): + """ + Make sure the file is closed, this prevents warnings on unclosed file + when running tests with python >= 3.2. 
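Two small behaviour changes in the handlers above: flag lines are now split on ',' with each item stripped, so '#, fuzzy,python-format' (no space after the comma between flags) parses the same as '#, fuzzy, python-format', and plural msgstr indexes are stored as integers. The flag handling in isolation:

line = '#, fuzzy,python-format'
flags = [flag.strip() for flag in line[3:].split(',')]
assert flags == ['fuzzy', 'python-format']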
+ """ + if self.fhandle: + self.fhandle.close() + def parse(self): """ Build the instance with the file handle provided in the @@ -1469,15 +1646,20 @@ """ # parse magic number magic_number = self._readbinary(' 1: entry = self._build_entry( msgid=msgid_tokens[0], msgid_plural=msgid_tokens[1], - msgstr_plural=dict((k,v) for k,v in enumerate(msgstr.split('\0'))) + msgstr_plural=dict((k, v) for k, v in + enumerate(msgstr.split(b('\0')))) ) else: entry = self._build_entry(msgid=msgid, msgstr=msgstr) @@ -1524,19 +1711,22 @@ def _build_entry(self, msgid, msgstr=None, msgid_plural=None, msgstr_plural=None): - msgctxt_msgid = msgid.split('\x04') + msgctxt_msgid = msgid.split(b('\x04')) + encoding = self.instance.encoding if len(msgctxt_msgid) > 1: kwargs = { - 'msgctxt': msgctxt_msgid[0], - 'msgid' : msgctxt_msgid[1], + 'msgctxt': msgctxt_msgid[0].decode(encoding), + 'msgid': msgctxt_msgid[1].decode(encoding), } else: - kwargs = {'msgid': msgid} + kwargs = {'msgid': msgid.decode(encoding)} if msgstr: - kwargs['msgstr'] = msgstr + kwargs['msgstr'] = msgstr.decode(encoding) if msgid_plural: - kwargs['msgid_plural'] = msgid_plural + kwargs['msgid_plural'] = msgid_plural.decode(encoding) if msgstr_plural: + for k in msgstr_plural: + msgstr_plural[k] = msgstr_plural[k].decode(encoding) kwargs['msgstr_plural'] = msgstr_plural return MOEntry(**kwargs) @@ -1550,5 +1740,99 @@ if len(tup) == 1: return tup[0] return tup +# }}} +# class TextWrapper {{{ + + +class TextWrapper(textwrap.TextWrapper): + """ + Subclass of textwrap.TextWrapper that backport the + drop_whitespace option. + """ + def __init__(self, *args, **kwargs): + drop_whitespace = kwargs.pop('drop_whitespace', True) + textwrap.TextWrapper.__init__(self, *args, **kwargs) + self.drop_whitespace = drop_whitespace + + def _wrap_chunks(self, chunks): + """_wrap_chunks(chunks : [string]) -> [string] + + Wrap a sequence of text chunks and return a list of lines of + length 'self.width' or less. (If 'break_long_words' is false, + some lines may be longer than this.) Chunks correspond roughly + to words and the whitespace between them: each chunk is + indivisible (modulo 'break_long_words'), but a line break can + come between any two chunks. Chunks should not have internal + whitespace; ie. a chunk is either all whitespace or a "word". + Whitespace chunks will be removed from the beginning and end of + lines, but apart from that whitespace is preserved. + """ + lines = [] + if self.width <= 0: + raise ValueError("invalid width %r (must be > 0)" % self.width) + + # Arrange in reverse order so items can be efficiently popped + # from a stack of chucks. + chunks.reverse() + + while chunks: + + # Start the list of chunks that will make up the current line. + # cur_len is just the length of all the chunks in cur_line. + cur_line = [] + cur_len = 0 + + # Figure out which static string will prefix this line. + if lines: + indent = self.subsequent_indent + else: + indent = self.initial_indent + + # Maximum width for this line. + width = self.width - len(indent) + + # First chunk on line is whitespace -- drop it, unless this + # is the very beginning of the text (ie. no lines started yet). + if self.drop_whitespace and chunks[-1].strip() == '' and lines: + del chunks[-1] + + while chunks: + l = len(chunks[-1]) + + # Can at least squeeze this chunk onto the current line. + if cur_len + l <= width: + cur_line.append(chunks.pop()) + cur_len += l + + # Nope, this line is full. 
+ else: + break + + # The current line is full, and the next chunk is too big to + # fit on *any* line (not just this one). + if chunks and len(chunks[-1]) > width: + self._handle_long_word(chunks, cur_line, cur_len, width) + + # If the last chunk on this line is all whitespace, drop it. + if self.drop_whitespace and cur_line and not cur_line[-1].strip(): + del cur_line[-1] + + # Convert current line back to a string and store it in list + # of all lines (return value). + if cur_line: + lines.append(indent + ''.join(cur_line)) + + return lines +# }}} +# function wrap() {{{ + + +def wrap(text, width=70, **kwargs): + """ + Wrap a single paragraph of text, returning a list of wrapped lines. + """ + if sys.version_info < (2, 6): + return TextWrapper(width=width, **kwargs).wrap(text) + return textwrap.wrap(text, width=width, **kwargs) # }}}
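For reference, the wire format decoded by _MOFileParser._build_entry() earlier in this patch: a context is stored as context + 0x04 + msgid, plural translations are NUL-joined, and everything is in the catalog encoding. A stand-alone sketch with made-up byte strings:

encoding = 'utf-8'                        # normally taken from the MO metadata
raw_msgid = b'menu\x04Open'               # context + 0x04 separator + msgid
raw_msgstr = b'Ouvrir\x00Ouvertures'      # plural forms joined with NUL bytes

ctxt, _, msgid = raw_msgid.partition(b'\x04')
plurals = dict((index, form.decode(encoding))
               for index, form in enumerate(raw_msgstr.split(b'\x00')))
assert (ctxt.decode(encoding), msgid.decode(encoding)) == ('menu', 'Open')
assert plurals == {0: 'Ouvrir', 1: 'Ouvertures'}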