| #! /usr/bin/env python | |
| # -*- coding: iso-8859-1 -*- | |
| # Originally written by Barry Warsaw <barry@zope.com> | |
| # | |
| # Minimally patched to make it even more xgettext compatible | |
| # by Peter Funk <pf@artcom-gmbh.de> | |
| # | |
| # 2002-11-22 Jürgen Hermann <jh@web.de> | |
| # Added checks that _() only contains string literals, and | |
| # command line args are resolved to module lists, i.e. you | |
| # can now pass a filename, a module or package name, or a | |
| # directory (including globbing chars, important for Win32). | |
| # Made docstring fit in 80 chars wide displays using pydoc. | |
| # | |
| # for selftesting | |
| try: | |
| import fintl | |
| _ = fintl.gettext | |
| except ImportError: | |
| _ = lambda s: s | |
# The module docstring doubles as the --help text printed by usage(); it is
# wrapped in _() so the help itself is translatable, and is %-expanded with
# globals() (to fill in %(DEFAULTKEYWORDS)s) when printed.
__doc__ = _("""pygettext -- Python equivalent of xgettext(1)

Many systems (Solaris, Linux, Gnu) provide extensive tools that ease the
internationalization of C programs. Most of these tools are independent of
the programming language and can be used from within Python programs.
Martin von Loewis' work[1] helps considerably in this regard.

There's one problem though; xgettext is the program that scans source code
looking for message strings, but it groks only C (or C++). Python
introduces a few wrinkles, such as dual quoting characters, triple quoted
strings, and raw strings. xgettext understands none of this.

Enter pygettext, which uses Python's standard tokenize module to scan
Python source code, generating .pot files identical to what GNU xgettext[2]
generates for C and C++ code. From there, the standard GNU tools can be
used.

A word about marking Python strings as candidates for translation. GNU
xgettext recognizes the following keywords: gettext, dgettext, dcgettext,
and gettext_noop. But those can be a lot of text to include all over your
code. C and C++ have a trick: they use the C preprocessor. Most
internationalized C source includes a #define for gettext() to _() so that
what has to be written in the source is much less. Thus these are both
translatable strings:

    gettext("Translatable String")
    _("Translatable String")

Python of course has no preprocessor so this doesn't work so well. Thus,
pygettext searches only for _() by default, but see the -k/--keyword flag
below for how to augment this.

[1] http://www.python.org/workshops/1997-10/proceedings/loewis.html
[2] http://www.gnu.org/software/gettext/gettext.html

NOTE: pygettext attempts to be option and feature compatible with GNU
xgettext where ever possible. However some options are still missing or are
not fully implemented. Also, xgettext's use of command line switches with
option arguments is broken, and in these cases, pygettext just defines
additional switches.

Usage: pygettext [options] inputfile ...

Options:

    -a
    --extract-all
        Extract all strings.

    -d name
    --default-domain=name
        Rename the default output file from messages.pot to name.pot.

    -E
    --escape
        Replace non-ASCII characters with octal escape sequences.

    -D
    --docstrings
        Extract module, class, method, and function docstrings.  These do
        not need to be wrapped in _() markers, and in fact cannot be for
        Python to consider them docstrings. (See also the -X option).

    -h
    --help
        Print this help message and exit.

    -k word
    --keyword=word
        Keywords to look for in addition to the default set, which are:
        %(DEFAULTKEYWORDS)s

        You can have multiple -k flags on the command line.

    -K
    --no-default-keywords
        Disable the default set of keywords (see above).  Any keywords
        explicitly added with the -k/--keyword option are still recognized.

    --no-location
        Do not write filename/lineno location comments.

    -n
    --add-location
        Write filename/lineno location comments indicating where each
        extracted string is found in the source.  These lines appear before
        each msgid.  The style of comments is controlled by the -S/--style
        option.  This is the default.

    -o filename
    --output=filename
        Rename the default output file from messages.pot to filename.  If
        filename is `-' then the output is sent to standard out.

    -p dir
    --output-dir=dir
        Output files will be placed in directory dir.

    -S stylename
    --style stylename
        Specify which style to use for location comments.  Two styles are
        supported:

        Solaris  # File: filename, line: line-number
        GNU      #: filename:line

        The style name is case insensitive.  GNU style is the default.

    -v
    --verbose
        Print the names of the files being processed.

    -V
    --version
        Print the version of pygettext and exit.

    -w columns
    --width=columns
        Set width of output to columns.

    -x filename
    --exclude-file=filename
        Specify a file that contains a list of strings that are not be
        extracted from the input files.  Each string to be excluded must
        appear on a line by itself in the file.

    -X filename
    --no-docstrings=filename
        Specify a file that contains a list of files (one per line) that
        should not have their docstrings extracted.  This is only useful in
        conjunction with the -D option above.

If `inputfile' is -, standard input is read.
""")
| import os | |
| import imp | |
| import sys | |
| import glob | |
| import time | |
| import getopt | |
| import token | |
| import tokenize | |
| import operator | |
__version__ = '1.5'

# Keywords whose call arguments are extracted by default; -k appends to this
# set and -K clears it before main() merges it into options.keywords.
default_keywords = ['_']
DEFAULTKEYWORDS = ', '.join(default_keywords)

# Used with str.join() to concatenate adjacent string fragments verbatim.
EMPTYSTRING = ''
# The normal pot-file header. msgmerge and Emacs's po-mode work better if it's
# there.
# %(time)s and %(version)s are filled in by TokenEater.write(); the template
# is itself wrapped in _() so the header can be localized.
pot_header = _('''\
# SOME DESCRIPTIVE TITLE.
# Copyright (C) YEAR ORGANIZATION
# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
#
msgid ""
msgstr ""
"Project-Id-Version: PACKAGE VERSION\\n"
"POT-Creation-Date: %(time)s\\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\\n"
"Language-Team: LANGUAGE <LL@li.org>\\n"
"MIME-Version: 1.0\\n"
"Content-Type: text/plain; charset=CHARSET\\n"
"Content-Transfer-Encoding: ENCODING\\n"
"Generated-By: pygettext.py %(version)s\\n"
''')
def usage(code, msg=''):
    """Print the help text (the %-expanded module docstring) to stderr,
    followed by *msg* if given, then exit with status *code*."""
    print >> sys.stderr, __doc__ % globals()
    if msg:
        print >> sys.stderr, msg
    sys.exit(code)
# 256-entry table mapping each character ordinal to its .po representation;
# populated by make_escapes() before any file is processed.
escapes = []
def make_escapes(pass_iso8859):
    """Build the 256-entry table mapping each character ordinal to its .po
    file representation, store it in the module-level `escapes` list used by
    escape(), and return it.

    If `pass_iso8859` is true, allow iso-8859 characters to pass through so
    that e.g. 'msgid "H\\xf6he"' would not result in 'msgid "H\\366he"'.
    Otherwise we escape any character outside the 32..126 range.
    """
    global escapes
    if pass_iso8859:
        mod = 128
    else:
        mod = 256
    # Rebuild the table from scratch on every call.  The previous version
    # appended to the existing global list, so a second call grew it past
    # 256 entries and left the extra, stale entries unreachable but present.
    table = []
    for i in range(256):
        if 32 <= (i % mod) <= 126:
            table.append(chr(i))
        else:
            table.append("\\%03o" % i)
    # C-style escapes for the special characters .po files care about.
    table[ord('\\')] = '\\\\'
    table[ord('\t')] = '\\t'
    table[ord('\r')] = '\\r'
    table[ord('\n')] = '\\n'
    table[ord('\"')] = '\\"'
    escapes = table
    return escapes
def escape(s):
    """Return *s* with every character replaced by its entry in the
    module-level `escapes` table built by make_escapes()."""
    return EMPTYSTRING.join([escapes[ord(ch)] for ch in s])
def safe_eval(s):
    """Unwrap the quoting of a string-literal source text *s* by evaluating
    it with builtins disabled, so the expression gets no access to them."""
    no_builtins = {'__builtins__': {}}
    return eval(s, no_builtins, {})
def normalize(s):
    """Convert a Python string into the form used in .po files, namely much
    closer to C style: a one-line message becomes a single quoted string,
    while a multi-line message becomes an empty first string followed by one
    quoted, escaped string per source line."""
    pieces = s.split('\n')
    if len(pieces) == 1:
        return '"' + escape(s) + '"'
    # A trailing newline leaves an empty final piece; fold it back into the
    # previous piece so every emitted line carries an explicit \n.
    if not pieces[-1]:
        del pieces[-1]
        pieces[-1] = pieces[-1] + '\n'
    escaped = [escape(piece) for piece in pieces]
    joiner = '\\n"\n"'
    return '""\n"' + joiner.join(escaped) + '"'
def containsAny(str, set):
    """Check whether 'str' contains ANY of the chars in 'set'"""
    # NOTE: the parameter names shadow the builtins `str` and `set`; they are
    # kept for interface compatibility with existing callers.
    for c in set:
        if c in str:
            return True
    return False
def _visit_pyfiles(list, dirname, names):
    """Helper for getFilesForName().

    os.path.walk() callback: appends (in place) to *list* the path of every
    Python source file found in *dirname*, and prunes CVS directories from
    *names* so the walk does not descend into them.
    """
    # get extension for python source files
    # (cached in the module-level _py_ext on first use; has_key is Python 2)
    if not globals().has_key('_py_ext'):
        global _py_ext
        _py_ext = [triple[0] for triple in imp.get_suffixes()
                   if triple[2] == imp.PY_SOURCE][0]
    # don't recurse into CVS directories
    if 'CVS' in names:
        names.remove('CVS')
    # add all *.py files to list
    list.extend(
        [os.path.join(dirname, file) for file in names
         if os.path.splitext(file)[1] == _py_ext]
        )
def _get_modpkg_path(dotted_name, pathlist=None):
    """Get the filesystem path for a module or a package.

    Return the file system path to a file for a module, and to a directory for
    a package. Return None if the name is not found, or is a builtin or
    extension module.
    """
    # split off top-most name
    parts = dotted_name.split('.', 1)
    if len(parts) > 1:
        # we have a dotted path, import top-level package
        try:
            file, pathname, description = imp.find_module(parts[0], pathlist)
            # find_module opens the file for a module; we only need the path
            if file: file.close()
        except ImportError:
            return None
        # check if it's indeed a package
        if description[2] == imp.PKG_DIRECTORY:
            # recursively handle the remaining name parts
            pathname = _get_modpkg_path(parts[1], [pathname])
        else:
            # dotted prefix resolved to a plain module: the full dotted
            # name cannot exist
            pathname = None
    else:
        # plain name
        try:
            file, pathname, description = imp.find_module(
                dotted_name, pathlist)
            if file:
                file.close()
            # only Python source files and packages are usable here;
            # builtins and extension modules have no scannable source
            if description[2] not in [imp.PY_SOURCE, imp.PKG_DIRECTORY]:
                pathname = None
        except ImportError:
            pathname = None
    return pathname
def getFilesForName(name):
    """Get a list of module files for a filename, a module or package name,
    or a directory.

    Accepts glob patterns for non-existent paths; returns [] when nothing
    can be resolved.
    """
    if not os.path.exists(name):
        # check for glob chars
        if containsAny(name, "*?[]"):
            files = glob.glob(name)
            list = []
            for file in files:
                # each match may itself be a directory/module; recurse
                list.extend(getFilesForName(file))
            return list
        # try to find module or package
        name = _get_modpkg_path(name)
        if not name:
            return []
    if os.path.isdir(name):
        # find all python files in directory
        # (os.path.walk is Python 2 only; _visit_pyfiles fills `list`)
        list = []
        os.path.walk(name, _visit_pyfiles, list)
        return list
    elif os.path.exists(name):
        # a single file
        return [name]
    return []
class TokenEater:
    """State machine fed every token of the input files (via __call__).

    Collects translatable strings -- keyword-call arguments and, optionally,
    docstrings -- into a mapping of
        message string -> {(filename, lineno): isdocstring}
    which write() dumps in .pot format.
    """
    def __init__(self, options):
        self.__options = options
        # message -> {(filename, lineno): isdocstring flag}
        self.__messages = {}
        # current state handler; always one of the __* methods below
        self.__state = self.__waiting
        # string fragments seen so far inside the current keyword(...) call
        self.__data = []
        # line number where the current keyword call started
        self.__lineno = -1
        # true until the first significant token of a module is seen
        self.__freshmodule = 1
        self.__curfile = None

    def __call__(self, ttype, tstring, stup, etup, line):
        # dispatch
##        import token
##        print >> sys.stderr, 'ttype:', token.tok_name[ttype], \
##              'tstring:', tstring
        self.__state(ttype, tstring, stup[0])

    def __waiting(self, ttype, tstring, lineno):
        """Default state: look for docstrings and extraction keywords."""
        opts = self.__options
        # Do docstring extractions, if enabled
        if opts.docstrings and not opts.nodocstrings.get(self.__curfile):
            # module docstring?
            if self.__freshmodule:
                if ttype == tokenize.STRING:
                    self.__addentry(safe_eval(tstring), lineno, isdocstring=1)
                    self.__freshmodule = 0
                elif ttype not in (tokenize.COMMENT, tokenize.NL):
                    self.__freshmodule = 0
                return
            # class docstring?
            if ttype == tokenize.NAME and tstring in ('class', 'def'):
                self.__state = self.__suiteseen
                return
        if ttype == tokenize.NAME and tstring in opts.keywords:
            self.__state = self.__keywordseen

    def __suiteseen(self, ttype, tstring, lineno):
        """After `class`/`def`: skip the header up to the colon."""
        # ignore anything until we see the colon
        if ttype == tokenize.OP and tstring == ':':
            self.__state = self.__suitedocstring

    def __suitedocstring(self, ttype, tstring, lineno):
        """After the suite colon: grab the docstring if one follows."""
        # ignore any intervening noise
        if ttype == tokenize.STRING:
            self.__addentry(safe_eval(tstring), lineno, isdocstring=1)
            self.__state = self.__waiting
        elif ttype not in (tokenize.NEWLINE, tokenize.INDENT,
                           tokenize.COMMENT):
            # there was no class docstring
            self.__state = self.__waiting

    def __keywordseen(self, ttype, tstring, lineno):
        """After a keyword like _: only an opening paren starts extraction."""
        if ttype == tokenize.OP and tstring == '(':
            self.__data = []
            self.__lineno = lineno
            self.__state = self.__openseen
        else:
            self.__state = self.__waiting

    def __openseen(self, ttype, tstring, lineno):
        """Inside keyword(...): collect adjacent string literals."""
        if ttype == tokenize.OP and tstring == ')':
            # We've seen the last of the translatable strings.  Record the
            # line number of the first line of the strings and update the list
            # of messages seen.  Reset state for the next batch.  If there
            # were no strings inside _(), then just ignore this entry.
            if self.__data:
                self.__addentry(EMPTYSTRING.join(self.__data))
            self.__state = self.__waiting
        elif ttype == tokenize.STRING:
            self.__data.append(safe_eval(tstring))
        elif ttype not in [tokenize.COMMENT, token.INDENT, token.DEDENT,
                           token.NEWLINE, tokenize.NL]:
            # warn if we see anything else than STRING or whitespace
            print >> sys.stderr, _(
                '*** %(file)s:%(lineno)s: Seen unexpected token "%(token)s"'
                ) % {
                'token': tstring,
                'file': self.__curfile,
                'lineno': self.__lineno
                }
            self.__state = self.__waiting

    def __addentry(self, msg, lineno=None, isdocstring=0):
        """Record *msg* at (current file, lineno) unless it is excluded."""
        if lineno is None:
            lineno = self.__lineno
        if not msg in self.__options.toexclude:
            entry = (self.__curfile, lineno)
            self.__messages.setdefault(msg, {})[entry] = isdocstring

    def set_filename(self, filename):
        """Start a new input file; re-arms module-docstring detection."""
        self.__curfile = filename
        self.__freshmodule = 1

    def write(self, fp):
        """Emit all collected messages to *fp* in .pot format."""
        options = self.__options
        timestamp = time.strftime('%Y-%m-%d %H:%M+%Z')
        # The time stamp in the header doesn't have the same format as that
        # generated by xgettext...
        print >> fp, pot_header % {'time': timestamp, 'version': __version__}
        # Sort the entries.  First sort each particular entry's keys, then
        # sort all the entries by their first item.
        reverse = {}
        for k, v in self.__messages.items():
            keys = v.keys()
            keys.sort()
            reverse.setdefault(tuple(keys), []).append((k, v))
        rkeys = reverse.keys()
        rkeys.sort()
        for rkey in rkeys:
            rentries = reverse[rkey]
            rentries.sort()
            for k, v in rentries:
                isdocstring = 0
                # If the entry was gleaned out of a docstring, then add a
                # comment stating so.  This is to aid translators who may wish
                # to skip translating some unimportant docstrings.
                # (reduce is a builtin under Python 2, which this script
                # targets.)
                if reduce(operator.__add__, v.values()):
                    isdocstring = 1
                # k is the message string, v is a dictionary-set of (filename,
                # lineno) tuples.  We want to sort the entries in v first by
                # file name and then by line number.
                v = v.keys()
                v.sort()
                if not options.writelocations:
                    pass
                # location comments are different b/w Solaris and GNU:
                elif options.locationstyle == options.SOLARIS:
                    for filename, lineno in v:
                        d = {'filename': filename, 'lineno': lineno}
                        print >>fp, _(
                            '# File: %(filename)s, line: %(lineno)d') % d
                elif options.locationstyle == options.GNU:
                    # fit as many locations on one line, as long as the
                    # resulting line length doesn't exceeds 'options.width'
                    locline = '#:'
                    for filename, lineno in v:
                        d = {'filename': filename, 'lineno': lineno}
                        s = _(' %(filename)s:%(lineno)d') % d
                        if len(locline) + len(s) <= options.width:
                            locline = locline + s
                        else:
                            print >> fp, locline
                            locline = "#:" + s
                    if len(locline) > 2:
                        print >> fp, locline
                if isdocstring:
                    print >> fp, '#, docstring'
                print >> fp, 'msgid', normalize(k)
                print >> fp, 'msgstr ""\n'
def main():
    """Command-line driver: parse options, expand the file arguments to a
    module list, run every input file through a TokenEater, and write the
    collected messages out as a .pot file."""
    global default_keywords
    try:
        opts, args = getopt.getopt(
            sys.argv[1:],
            'ad:DEhk:Kno:p:S:Vvw:x:X:',
            ['extract-all', 'default-domain=', 'escape', 'help',
             'keyword=', 'no-default-keywords',
             'add-location', 'no-location', 'output=', 'output-dir=',
             'style=', 'verbose', 'version', 'width=', 'exclude-file=',
             'docstrings', 'no-docstrings',
             ])
    except getopt.error, msg:
        usage(1, msg)

    # for holding option values
    class Options:
        # constants
        GNU = 1
        SOLARIS = 2
        # defaults
        extractall = 0 # FIXME: currently this option has no effect at all.
        escape = 0
        keywords = []
        outpath = ''
        outfile = 'messages.pot'
        writelocations = 1
        locationstyle = GNU
        verbose = 0
        width = 78
        excludefilename = ''
        docstrings = 0
        nodocstrings = {}

    options = Options()
    # maps the (case-insensitive) --style argument to its constant
    locations = {'gnu' : options.GNU,
                 'solaris' : options.SOLARIS,
                 }

    # parse options
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt in ('-a', '--extract-all'):
            options.extractall = 1
        elif opt in ('-d', '--default-domain'):
            options.outfile = arg + '.pot'
        elif opt in ('-E', '--escape'):
            options.escape = 1
        elif opt in ('-D', '--docstrings'):
            options.docstrings = 1
        elif opt in ('-k', '--keyword'):
            options.keywords.append(arg)
        elif opt in ('-K', '--no-default-keywords'):
            default_keywords = []
        elif opt in ('-n', '--add-location'):
            options.writelocations = 1
        elif opt in ('--no-location',):
            options.writelocations = 0
        elif opt in ('-S', '--style'):
            options.locationstyle = locations.get(arg.lower())
            if options.locationstyle is None:
                usage(1, _('Invalid value for --style: %s') % arg)
        elif opt in ('-o', '--output'):
            options.outfile = arg
        elif opt in ('-p', '--output-dir'):
            options.outpath = arg
        elif opt in ('-v', '--verbose'):
            options.verbose = 1
        elif opt in ('-V', '--version'):
            print _('pygettext.py (xgettext for Python) %s') % __version__
            sys.exit(0)
        elif opt in ('-w', '--width'):
            try:
                options.width = int(arg)
            except ValueError:
                usage(1, _('--width argument must be an integer: %s') % arg)
        elif opt in ('-x', '--exclude-file'):
            options.excludefilename = arg
        elif opt in ('-X', '--no-docstrings'):
            # each line of the file names one input file whose docstrings
            # must not be extracted
            fp = open(arg)
            try:
                while 1:
                    line = fp.readline()
                    if not line:
                        break
                    options.nodocstrings[line[:-1]] = 1
            finally:
                fp.close()

    # calculate escapes
    make_escapes(options.escape)

    # calculate all keywords
    options.keywords.extend(default_keywords)

    # initialize list of strings to exclude
    if options.excludefilename:
        try:
            fp = open(options.excludefilename)
            # note: entries keep their trailing newline, matching the
            # newline-terminated msgids compared against them
            options.toexclude = fp.readlines()
            fp.close()
        except IOError:
            print >> sys.stderr, _(
                "Can't read --exclude-file: %s") % options.excludefilename
            sys.exit(1)
    else:
        options.toexclude = []

    # resolve args to module lists
    expanded = []
    for arg in args:
        if arg == '-':
            expanded.append(arg)
        else:
            expanded.extend(getFilesForName(arg))
    args = expanded

    # slurp through all the files
    eater = TokenEater(options)
    for filename in args:
        if filename == '-':
            if options.verbose:
                print _('Reading standard input')
            fp = sys.stdin
            closep = 0
        else:
            if options.verbose:
                print _('Working on %s') % filename
            fp = open(filename)
            closep = 1
        try:
            eater.set_filename(filename)
            try:
                tokenize.tokenize(fp.readline, eater)
            except tokenize.TokenError, e:
                print >> sys.stderr, '%s: %s, line %d, column %d' % (
                    e[0], filename, e[1][0], e[1][1])
        finally:
            if closep:
                fp.close()

    # write the output
    if options.outfile == '-':
        fp = sys.stdout
        closep = 0
    else:
        if options.outpath:
            options.outfile = os.path.join(options.outpath, options.outfile)
        fp = open(options.outfile, 'w')
        closep = 1
    try:
        eater.write(fp)
    finally:
        if closep:
            fp.close()
# Script entry point.
if __name__ == '__main__':
    main()
    # some more test strings
    # (these give pygettext extra cases to exercise when run over its own
    # source for selftesting; see the comment near the top of the file)
    _(u'a unicode string')
    # this one creates a warning
    _('*** Seen unexpected token "%(token)s"') % {'token': 'test'}
    _('more' 'than' 'one' 'string')