rnd_gettext_impr(20m ago) make roundup_gettext scan for translations in extensions and detectors diff --git a/doc/developers.txt b/doc/developers.txt --- a/doc/developers.txt +++ b/doc/developers.txt @@ -463,7 +463,8 @@ roundup-gettext -extracting translatable messages from tracker's html templates. +extracting translatable messages from tracker's html templates and +detectors / extensions (assuming `polib`_ is installed) This utility creates message template file ``messages.pot`` in ``locale`` subdirectory of the tracker home directory. Translated messages may be put in *locale*.po files (where *locale* is selected @@ -573,6 +574,7 @@ .. _po filetype plugin: https://vim.sourceforge.io/scripts/script.php?script_id=695 .. _PO utilities: https://github.com/pinard/po-utils +.. _polib: https://polib.readthedocs.io .. _poEdit: https://poedit.net/ .. _pylint: https://pylint.pycqa.org/en/latest/ .. _Roundup Source: diff --git a/roundup/pygettext.py b/roundup/pygettext.py new file mode 100644 --- /dev/null +++ b/roundup/pygettext.py @@ -0,0 +1,654 @@ +#! /usr/bin/env python +# -*- coding: iso-8859-1 -*- +# Originally written by Barry Warsaw +# +# Minimally patched to make it even more xgettext compatible +# by Peter Funk +# +# 2002-11-22 J�rgen Hermann +# Added checks that _() only contains string literals, and +# command line args are resolved to module lists, i.e. you +# can now pass a filename, a module or package name, or a +# directory (including globbing chars, important for Win32). +# Made docstring fit in 80 chars wide displays using pydoc. +# + +# for selftesting +try: + import fintl + _ = fintl.gettext +except ImportError: + _ = lambda s: s + +__doc__ = _("""pygettext -- Python equivalent of xgettext(1) + +Many systems (Solaris, Linux, Gnu) provide extensive tools that ease the +internationalization of C programs. Most of these tools are independent of +the programming language and can be used from within Python programs. +Martin von Loewis' work[1] helps considerably in this regard. + +There's one problem though; xgettext is the program that scans source code +looking for message strings, but it groks only C (or C++). Python +introduces a few wrinkles, such as dual quoting characters, triple quoted +strings, and raw strings. xgettext understands none of this. + +Enter pygettext, which uses Python's standard tokenize module to scan +Python source code, generating .pot files identical to what GNU xgettext[2] +generates for C and C++ code. From there, the standard GNU tools can be +used. + +A word about marking Python strings as candidates for translation. GNU +xgettext recognizes the following keywords: gettext, dgettext, dcgettext, +and gettext_noop. But those can be a lot of text to include all over your +code. C and C++ have a trick: they use the C preprocessor. Most +internationalized C source includes a #define for gettext() to _() so that +what has to be written in the source is much less. Thus these are both +translatable strings: + + gettext("Translatable String") + _("Translatable String") + +Python of course has no preprocessor so this doesn't work so well. Thus, +pygettext searches only for _() by default, but see the -k/--keyword flag +below for how to augment this. + + [1] http://www.python.org/workshops/1997-10/proceedings/loewis.html + [2] http://www.gnu.org/software/gettext/gettext.html + +NOTE: pygettext attempts to be option and feature compatible with GNU +xgettext where ever possible. However some options are still missing or are +not fully implemented. Also, xgettext's use of command line switches with +option arguments is broken, and in these cases, pygettext just defines +additional switches. + +Usage: pygettext [options] inputfile ... + +Options: + + -a + --extract-all + Extract all strings. + + -d name + --default-domain=name + Rename the default output file from messages.pot to name.pot. + + -E + --escape + Replace non-ASCII characters with octal escape sequences. + + -D + --docstrings + Extract module, class, method, and function docstrings. These do + not need to be wrapped in _() markers, and in fact cannot be for + Python to consider them docstrings. (See also the -X option). + + -h + --help + Print this help message and exit. + + -k word + --keyword=word + Keywords to look for in addition to the default set, which are: + %(DEFAULTKEYWORDS)s + + You can have multiple -k flags on the command line. + + -K + --no-default-keywords + Disable the default set of keywords (see above). Any keywords + explicitly added with the -k/--keyword option are still recognized. + + --no-location + Do not write filename/lineno location comments. + + -n + --add-location + Write filename/lineno location comments indicating where each + extracted string is found in the source. These lines appear before + each msgid. The style of comments is controlled by the -S/--style + option. This is the default. + + -o filename + --output=filename + Rename the default output file from messages.pot to filename. If + filename is `-' then the output is sent to standard out. + + -p dir + --output-dir=dir + Output files will be placed in directory dir. + + -S stylename + --style stylename + Specify which style to use for location comments. Two styles are + supported: + + Solaris # File: filename, line: line-number + GNU #: filename:line + + The style name is case insensitive. GNU style is the default. + + -v + --verbose + Print the names of the files being processed. + + -V + --version + Print the version of pygettext and exit. + + -w columns + --width=columns + Set width of output to columns. + + -x filename + --exclude-file=filename + Specify a file that contains a list of strings that are not be + extracted from the input files. Each string to be excluded must + appear on a line by itself in the file. + + -X filename + --no-docstrings=filename + Specify a file that contains a list of files (one per line) that + should not have their docstrings extracted. This is only useful in + conjunction with the -D option above. + +If `inputfile' is -, standard input is read. +""") + +import os +import imp +import sys +import glob +import time +import getopt +import token +import tokenize +import operator + +__version__ = '1.5' + +default_keywords = ['_'] +DEFAULTKEYWORDS = ', '.join(default_keywords) + +EMPTYSTRING = '' + +# The normal pot-file header. msgmerge and Emacs's po-mode work better if it's +# there. +pot_header = _('''\ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) YEAR ORGANIZATION +# FIRST AUTHOR , YEAR. +# +msgid "" +msgstr "" +"Project-Id-Version: PACKAGE VERSION\\n" +"POT-Creation-Date: %(time)s\\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n" +"Last-Translator: FULL NAME \\n" +"Language-Team: LANGUAGE \\n" +"MIME-Version: 1.0\\n" +"Content-Type: text/plain; charset=CHARSET\\n" +"Content-Transfer-Encoding: ENCODING\\n" +"Generated-By: pygettext.py %(version)s\\n" + +''') + +def usage(code, msg=''): + print >> sys.stderr, __doc__ % globals() + if msg: + print >> sys.stderr, msg + sys.exit(code) + + +escapes = [] + +def make_escapes(pass_iso8859): + global escapes + escapes = [chr(i) for i in range(256)] + if pass_iso8859: + # Allow iso-8859 characters to pass through so that e.g. 'msgid + # "H�he"' would result not result in 'msgid "H\366he"'. Otherwise we + # escape any character outside the 32..126 range. + mod = 128 + else: + mod = 256 + for i in range(mod): + if not(32 <= i <= 126): + escapes[i] = "\\%03o" % i + escapes[ord('\\')] = '\\\\' + escapes[ord('\t')] = '\\t' + escapes[ord('\r')] = '\\r' + escapes[ord('\n')] = '\\n' + escapes[ord('\"')] = '\\"' + + +def escape(s): + global escapes + s = list(s) + for i in range(len(s)): + s[i] = escapes[ord(s[i])] + return EMPTYSTRING.join(s) + + +def safe_eval(s): + # unwrap quotes, safely + return eval(s, {'__builtins__':{}}, {}) + + +def normalize(s): + # This converts the various Python string types into a format that is + # appropriate for .po files, namely much closer to C style. + lines = s.split('\n') + if len(lines) == 1: + s = '"' + escape(s) + '"' + else: + if not lines[-1]: + del lines[-1] + lines[-1] = lines[-1] + '\n' + for i in range(len(lines)): + lines[i] = escape(lines[i]) + lineterm = '\\n"\n"' + s = '""\n"' + lineterm.join(lines) + '"' + return s + +def containsAny(str, set): + """Check whether 'str' contains ANY of the chars in 'set'""" + return 1 in [c in str for c in set] + + +def _get_modpkg_path(dotted_name, pathlist=None): + """Get the filesystem path for a module or a package. + + Return the file system path to a file for a module, and to a directory for + a package. Return None if the name is not found, or is a builtin or + extension module. + """ + # split off top-most name + parts = dotted_name.split('.', 1) + + if len(parts) > 1: + # we have a dotted path, import top-level package + try: + file, pathname, description = imp.find_module(parts[0], pathlist) + if file: file.close() + except ImportError: + return None + + # check if it's indeed a package + if description[2] == imp.PKG_DIRECTORY: + # recursively handle the remaining name parts + pathname = _get_modpkg_path(parts[1], [pathname]) + else: + pathname = None + else: + # plain name + try: + file, pathname, description = imp.find_module( + dotted_name, pathlist) + if file: + file.close() + if description[2] not in [imp.PY_SOURCE, imp.PKG_DIRECTORY]: + pathname = None + except ImportError: + pathname = None + + return pathname + + +def getFilesForName(name): + """Get a list of module files for a filename, a module or package name, + or a directory. + """ + if not os.path.exists(name): + # check for glob chars + if containsAny(name, "*?[]"): + files = glob.glob(name) + list = [] + for file in files: + list.extend(getFilesForName(file)) + return list + + # try to find module or package + name = _get_modpkg_path(name) + if not name: + return [] + + if os.path.isdir(name): + # find all python files in directory + list = [] + # get extension for python source files + if '_py_ext' not in globals(): + global _py_ext + _py_ext = [triple[0] for triple in imp.get_suffixes() + if triple[2] == imp.PY_SOURCE][0] + for root, dirs, files in os.walk(name): + # don't recurse into CVS directories + if 'CVS' in dirs: + dirs.remove('CVS') + # add all *.py files to list + list.extend( + [os.path.join(root, file) for file in files + if os.path.splitext(file)[1] == _py_ext] + ) + return list + elif os.path.exists(name): + # a single file + return [name] + + return [] + +class TokenEater: + def __init__(self, options): + self.__options = options + self.__messages = {} + self.__state = self.__waiting + self.__data = [] + self.__lineno = -1 + self.__freshmodule = 1 + self.__curfile = None + + def __call__(self, ttype, tstring, stup, etup, line): + # dispatch +## import token +## print >> sys.stderr, 'ttype:', token.tok_name[ttype], \ +## 'tstring:', tstring + self.__state(ttype, tstring, stup[0]) + + def __waiting(self, ttype, tstring, lineno): + opts = self.__options + # Do docstring extractions, if enabled + if opts.docstrings and not opts.nodocstrings.get(self.__curfile): + # module docstring? + if self.__freshmodule: + if ttype == tokenize.STRING: + self.__addentry(safe_eval(tstring), lineno, isdocstring=1) + self.__freshmodule = 0 + elif ttype not in (tokenize.COMMENT, tokenize.NL): + self.__freshmodule = 0 + return + # class docstring? + if ttype == tokenize.NAME and tstring in ('class', 'def'): + self.__state = self.__suiteseen + return + if ttype == tokenize.NAME and tstring in opts.keywords: + self.__state = self.__keywordseen + + def __suiteseen(self, ttype, tstring, lineno): + # ignore anything until we see the colon + if ttype == tokenize.OP and tstring == ':': + self.__state = self.__suitedocstring + + def __suitedocstring(self, ttype, tstring, lineno): + # ignore any intervening noise + if ttype == tokenize.STRING: + self.__addentry(safe_eval(tstring), lineno, isdocstring=1) + self.__state = self.__waiting + elif ttype not in (tokenize.NEWLINE, tokenize.INDENT, + tokenize.COMMENT): + # there was no class docstring + self.__state = self.__waiting + + def __keywordseen(self, ttype, tstring, lineno): + if ttype == tokenize.OP and tstring == '(': + self.__data = [] + self.__lineno = lineno + self.__state = self.__openseen + else: + self.__state = self.__waiting + + def __openseen(self, ttype, tstring, lineno): + if ttype == tokenize.OP and tstring == ')': + # We've seen the last of the translatable strings. Record the + # line number of the first line of the strings and update the list + # of messages seen. Reset state for the next batch. If there + # were no strings inside _(), then just ignore this entry. + if self.__data: + self.__addentry(EMPTYSTRING.join(self.__data)) + self.__state = self.__waiting + elif ttype == tokenize.STRING: + self.__data.append(safe_eval(tstring)) + elif ttype not in [tokenize.COMMENT, token.INDENT, token.DEDENT, + token.NEWLINE, tokenize.NL]: + # warn if we see anything else than STRING or whitespace + print >> sys.stderr, _( + '*** %(file)s:%(lineno)s: Seen unexpected token "%(token)s"' + ) % { + 'token': tstring, + 'file': self.__curfile, + 'lineno': self.__lineno + } + self.__state = self.__waiting + + def __addentry(self, msg, lineno=None, isdocstring=0): + if lineno is None: + lineno = self.__lineno + if not msg in self.__options.toexclude: + entry = (self.__curfile, lineno) + self.__messages.setdefault(msg, {})[entry] = isdocstring + + def set_filename(self, filename): + self.__curfile = filename + self.__freshmodule = 1 + + def write(self, fp): + options = self.__options + timestamp = time.strftime('%Y-%m-%d %H:%M+%Z') + # The time stamp in the header doesn't have the same format as that + # generated by xgettext... + print >> fp, pot_header % {'time': timestamp, 'version': __version__} + # Sort the entries. First sort each particular entry's keys, then + # sort all the entries by their first item. + reverse = {} + for k, v in self.__messages.items(): + keys = v.keys() + keys.sort() + reverse.setdefault(tuple(keys), []).append((k, v)) + rkeys = reverse.keys() + rkeys.sort() + for rkey in rkeys: + rentries = reverse[rkey] + rentries.sort() + for k, v in rentries: + isdocstring = 0 + # If the entry was gleaned out of a docstring, then add a + # comment stating so. This is to aid translators who may wish + # to skip translating some unimportant docstrings. + if reduce(operator.__add__, v.values()): + isdocstring = 1 + # k is the message string, v is a dictionary-set of (filename, + # lineno) tuples. We want to sort the entries in v first by + # file name and then by line number. + v = v.keys() + v.sort() + if not options.writelocations: + pass + # location comments are different b/w Solaris and GNU: + elif options.locationstyle == options.SOLARIS: + for filename, lineno in v: + d = {'filename': filename, 'lineno': lineno} + print >>fp, _( + '# File: %(filename)s, line: %(lineno)d') % d + elif options.locationstyle == options.GNU: + # fit as many locations on one line, as long as the + # resulting line length doesn't exceed 'options.width' + locline = '#:' + for filename, lineno in v: + d = {'filename': filename, 'lineno': lineno} + s = _(' %(filename)s:%(lineno)d') % d + if len(locline) + len(s) <= options.width: + locline = locline + s + else: + print >> fp, locline + locline = "#:" + s + if len(locline) > 2: + print >> fp, locline + if isdocstring: + print >> fp, '#, docstring' + print >> fp, 'msgid', normalize(k) + print >> fp, 'msgstr ""\n' + + +def main(): + global default_keywords + try: + opts, args = getopt.getopt( + sys.argv[1:], + 'ad:DEhk:Kno:p:S:Vvw:x:X:', + ['extract-all', 'default-domain=', 'escape', 'help', + 'keyword=', 'no-default-keywords', + 'add-location', 'no-location', 'output=', 'output-dir=', + 'style=', 'verbose', 'version', 'width=', 'exclude-file=', + 'docstrings', 'no-docstrings', + ]) + except getopt.error, msg: + usage(1, msg) + + # for holding option values + class Options: + # constants + GNU = 1 + SOLARIS = 2 + # defaults + extractall = 0 # FIXME: currently this option has no effect at all. + escape = 0 + keywords = [] + outpath = '' + outfile = 'messages.pot' + writelocations = 1 + locationstyle = GNU + verbose = 0 + width = 78 + excludefilename = '' + docstrings = 0 + nodocstrings = {} + + options = Options() + locations = {'gnu' : options.GNU, + 'solaris' : options.SOLARIS, + } + + # parse options + for opt, arg in opts: + if opt in ('-h', '--help'): + usage(0) + elif opt in ('-a', '--extract-all'): + options.extractall = 1 + elif opt in ('-d', '--default-domain'): + options.outfile = arg + '.pot' + elif opt in ('-E', '--escape'): + options.escape = 1 + elif opt in ('-D', '--docstrings'): + options.docstrings = 1 + elif opt in ('-k', '--keyword'): + options.keywords.append(arg) + elif opt in ('-K', '--no-default-keywords'): + default_keywords = [] + elif opt in ('-n', '--add-location'): + options.writelocations = 1 + elif opt in ('--no-location',): + options.writelocations = 0 + elif opt in ('-S', '--style'): + options.locationstyle = locations.get(arg.lower()) + if options.locationstyle is None: + usage(1, _('Invalid value for --style: %s') % arg) + elif opt in ('-o', '--output'): + options.outfile = arg + elif opt in ('-p', '--output-dir'): + options.outpath = arg + elif opt in ('-v', '--verbose'): + options.verbose = 1 + elif opt in ('-V', '--version'): + print _('pygettext.py (xgettext for Python) %s') % __version__ + sys.exit(0) + elif opt in ('-w', '--width'): + try: + options.width = int(arg) + except ValueError: + usage(1, _('--width argument must be an integer: %s') % arg) + elif opt in ('-x', '--exclude-file'): + options.excludefilename = arg + elif opt in ('-X', '--no-docstrings'): + fp = open(arg) + try: + while 1: + line = fp.readline() + if not line: + break + options.nodocstrings[line[:-1]] = 1 + finally: + fp.close() + + # calculate escapes + make_escapes(not options.escape) + + # calculate all keywords + options.keywords.extend(default_keywords) + + # initialize list of strings to exclude + if options.excludefilename: + try: + fp = open(options.excludefilename) + options.toexclude = fp.readlines() + fp.close() + except IOError: + print >> sys.stderr, _( + "Can't read --exclude-file: %s") % options.excludefilename + sys.exit(1) + else: + options.toexclude = [] + + # resolve args to module lists + expanded = [] + for arg in args: + if arg == '-': + expanded.append(arg) + else: + expanded.extend(getFilesForName(arg)) + args = expanded + + # slurp through all the files + eater = TokenEater(options) + for filename in args: + if filename == '-': + if options.verbose: + print _('Reading standard input') + fp = sys.stdin + closep = 0 + else: + if options.verbose: + print _('Working on %s') % filename + fp = open(filename) + closep = 1 + try: + eater.set_filename(filename) + try: + tokenize.tokenize(fp.readline, eater) + except tokenize.TokenError, e: + print >> sys.stderr, '%s: %s, line %d, column %d' % ( + e[0], filename, e[1][0], e[1][1]) + finally: + if closep: + fp.close() + + # write the output + if options.outfile == '-': + fp = sys.stdout + closep = 0 + else: + if options.outpath: + options.outfile = os.path.join(options.outpath, options.outfile) + fp = open(options.outfile, 'w') + closep = 1 + try: + eater.write(fp) + finally: + if closep: + fp.close() + +if __name__ == '__main__': + main() + # some more test strings + _(u'a unicode string') + # this one creates a warning + _('*** Seen unexpected token "%(token)s"') % {'token': 'test'} + _('more' 'than' 'one' 'string') diff --git a/roundup/scripts/roundup_gettext.py b/roundup/scripts/roundup_gettext.py --- a/roundup/scripts/roundup_gettext.py +++ b/roundup/scripts/roundup_gettext.py @@ -2,7 +2,7 @@ # # Copyright 2004 Richard Jones (richard@mechanicalcat.net) -"""Extract translatable strings from tracker templates""" +"""Extract translatable strings from tracker templates and detectors/extensions""" from __future__ import print_function import os @@ -23,6 +23,30 @@ from roundup.i18n import _ from roundup.cgi.TAL import talgettext +from roundup.pygettext import make_escapes, TokenEater, tokenize +import polib, tempfile + +# from pygettext's main(): +class Options: + # constants + GNU = 1 + SOLARIS = 2 + # defaults + extractall = 0 # FIXME: currently this option has no effect at all. + escape = 0 + keywords = ["_", "gettext", "ngettext", "ugettext"] + outpath = '' + outfile = '' + writelocations = 1 + locationstyle = GNU + verbose = 0 + width = 78 + excludefilename = '' + docstrings = 0 + nodocstrings = {} + toexclude = [] # TODO we should exclude all strings already found in some template + +tokeneater_options = Options() # name of message template file. # i don't think this will ever need to be changed, but still... @@ -62,6 +86,45 @@ # run talgettext.main() + # we have now everything from the templates in the TEMPLATE_FILE + # now we search in home/detectors and home/extensions *.py files for + # tokeneater_options.keywords + # this is partly assembled from pygettext's main() + make_escapes(not tokeneater_options.escape) + + pyfiles = [] + for source in ["detectors", "extensions"] : + for root, dirs, files in os.walk (os.path.join ("..", source)) : + pyfiles.extend ([os.path.join (root, f) for f in files if f.endswith (".py")]) + + eater = TokenEater (tokeneater_options) + + for filename in pyfiles : + eater.set_filename (filename) + with open (filename, "r") as f: + try : + tokenize.tokenize(f.readline, eater) + except tokenize.TokenError, e: + print >> sys.stderr, '%s: %s, line %d, column %d' % ( + e[0], filename, e[1][0], e[1][1]) + + with tempfile.NamedTemporaryFile("w") as tf : + eater.write (tf) + tf.seek (0) + p1 = polib.pofile(TEMPLATE_FILE) + p2 = polib.pofile(tf.name) + + p2_msg_ids = set([e.msgid for e in p2]) + for e in p1 : + if e.msgid in p2_msg_ids : + p2_e = p2.find (e.msgid) + e.occurrences.extend (p2_e.occurrences) + p2_msg_ids.remove (e.msgid) + + for msgid in p2_msg_ids : + p1.append (p2.find (msgid)) + + p1.save () if __name__ == "__main__": run()