From dbaccf70ace2bf6eb9fcac00c864bbeaec72cd13 Mon Sep 17 00:00:00 2001 From: Joseph Myers Date: Mon, 18 Jun 2018 00:50:47 +0000 Subject: [PATCH 42/54] Python 3 preparation: unicode. --- roundup/anypy/strings.py | 75 ++++++++++++++++++++++++++++++++++++++ roundup/backends/back_sqlite.py | 3 +- roundup/backends/indexer_rdbms.py | 6 +-- roundup/backends/indexer_whoosh.py | 4 +- roundup/backends/rdbms_common.py | 4 +- roundup/cgi/PageTemplates/TALES.py | 2 +- roundup/cgi/TranslationService.py | 19 ++++------ roundup/cgi/engine_chameleon.py | 3 +- roundup/cgi/engine_jinja2.py | 4 +- roundup/cgi/templating.py | 18 ++++----- roundup/configuration.py | 3 +- roundup/dehtml.py | 3 +- roundup/i18n.py | 3 +- roundup/mailer.py | 5 ++- roundup/mailgw.py | 3 +- roundup/password.py | 11 +++--- roundup/roundupdb.py | 6 ++- roundup/xmlrpc.py | 10 ++--- scripts/import_sf.py | 11 +++--- 19 files changed, 134 insertions(+), 59 deletions(-) create mode 100644 roundup/anypy/strings.py diff --git a/roundup/anypy/strings.py b/roundup/anypy/strings.py new file mode 100644 index 0000000..2429b3b --- /dev/null +++ b/roundup/anypy/strings.py @@ -0,0 +1,75 @@ +# Roundup represents text internally using the native Python str type. +# In Python 3, these are Unicode strings. In Python 2, these are +# encoded using UTF-8, and the Python 2 unicode type is only used in a +# few places, generally for interacting with external modules +# requiring that type to be used. + +import sys +_py3 = sys.version_info[0] > 2 + +def b2s(b): + """Convert a UTF-8 encoded bytes object to the internal string format.""" + if _py3: + return b.decode('utf-8') + else: + return b + +def s2b(s): + """Convert a string object to UTF-8 encoded bytes.""" + if _py3: + return s.encode('utf-8') + else: + return s + +def s2u(s, errors='strict'): + """Convert a string object to a Unicode string.""" + if _py3: + return s + else: + return unicode(s, 'utf-8', errors) + +def u2s(u): + """Convert a Unicode string to the internal string format.""" + if _py3: + return u + else: + return u.encode('utf-8') + +def us2u(s, errors='strict'): + """Convert a string or Unicode string to a Unicode string.""" + if _py3: + return s + else: + if isinstance(s, unicode): + return s + else: + return unicode(s, 'utf-8', errors) + +def us2s(u): + """Convert a string or Unicode string to the internal string format.""" + if _py3: + return u + else: + if isinstance(u, unicode): + return u.encode('utf-8') + else: + return u + +def uany2s(u): + """Convert a Unicode string or other object to the internal string format. + + Objects that are not Unicode strings are passed to str().""" + if _py3: + return str(u) + else: + if isinstance(u, unicode): + return u.encode('utf-8') + else: + return str(u) + +def is_us(s): + """Return whether an object is a string or Unicode string.""" + if _py3: + return isinstance(s, str) + else: + return isinstance(s, str) or isinstance(s, unicode) diff --git a/roundup/backends/back_sqlite.py b/roundup/backends/back_sqlite.py index 002f0ba..214286b 100644 --- a/roundup/backends/back_sqlite.py +++ b/roundup/backends/back_sqlite.py @@ -13,6 +13,7 @@ import os, base64, marshal, shutil, time, logging from roundup import hyperdb, date, password from roundup.backends import rdbms_common from roundup.backends.sessions_dbm import Sessions, OneTimeKeys +from roundup.anypy.strings import uany2s sqlite_version = None try: @@ -85,7 +86,7 @@ class Database(rdbms_common.Database): hyperdb.Multilink : lambda x: x, # used in journal marshalling } sql_to_hyperdb_value = { - hyperdb.String : lambda x: isinstance(x, unicode) and x.encode('utf8') or str(x), + hyperdb.String : uany2s, hyperdb.Date : lambda x: date.Date(str(x)), hyperdb.Link : str, # XXX numeric ids hyperdb.Interval : date.Interval, diff --git a/roundup/backends/indexer_rdbms.py b/roundup/backends/indexer_rdbms.py index 6b99035..a6c7392 100644 --- a/roundup/backends/indexer_rdbms.py +++ b/roundup/backends/indexer_rdbms.py @@ -5,6 +5,7 @@ propname, itemid) instances. import re from roundup.backends.indexer_common import Indexer as IndexerBase +from roundup.anypy.strings import us2u, u2s class Indexer(IndexerBase): def __init__(self, db): @@ -61,10 +62,9 @@ class Indexer(IndexerBase): self.db.cursor.execute(sql, (id, )) # ok, find all the unique words in the text - if not isinstance(text, unicode): - text = unicode(text, "utf-8", "replace") + text = us2u(text, "replace") text = text.upper() - wordlist = [w.encode("utf-8") + wordlist = [u2s(w) for w in re.findall(r'(?u)\b\w{%d,%d}\b' % (self.minlength, self.maxlength), text)] words = set() diff --git a/roundup/backends/indexer_whoosh.py b/roundup/backends/indexer_whoosh.py index 070777e..18f5ba1 100644 --- a/roundup/backends/indexer_whoosh.py +++ b/roundup/backends/indexer_whoosh.py @@ -5,6 +5,7 @@ import re, os from whoosh import fields, qparser, index, query, analysis from roundup.backends.indexer_common import Indexer as IndexerBase +from roundup.anypy.strings import us2u class Indexer(IndexerBase): def __init__(self, db): @@ -78,8 +79,7 @@ class Indexer(IndexerBase): if not text: text = u'' - if not isinstance(text, unicode): - text = unicode(text, "utf-8", "replace") + text = us2u(text, "replace") # We use the identifier twice: once in the actual "text" being # indexed so we can search on it, and again as the "data" being diff --git a/roundup/backends/rdbms_common.py b/roundup/backends/rdbms_common.py index 2b98b0b..2552e34 100644 --- a/roundup/backends/rdbms_common.py +++ b/roundup/backends/rdbms_common.py @@ -69,6 +69,7 @@ from roundup.backends.sessions_rdbms import Sessions, OneTimeKeys from roundup.date import Range from roundup.backends.back_anydbm import compile_expression +from roundup.anypy.strings import us2s # dummy value meaning "argument not passed" @@ -2944,8 +2945,7 @@ class Class(hyperdb.Class): elif isinstance(prop, hyperdb.Password): value = password.Password(encrypted=value) elif isinstance(prop, String): - if isinstance(value, unicode): - value = value.encode('utf8') + value = us2s(value) if not isinstance(value, str): raise TypeError('new property "%(propname)s" not a ' 'string: %(value)r'%locals()) diff --git a/roundup/cgi/PageTemplates/TALES.py b/roundup/cgi/PageTemplates/TALES.py index 3462681..a94870e 100644 --- a/roundup/cgi/PageTemplates/TALES.py +++ b/roundup/cgi/PageTemplates/TALES.py @@ -231,7 +231,7 @@ class Context: text = self.evaluate(expr) if text is Default or text is None: return text - if isinstance(text, unicode): + if isinstance(text, type(u'')): return text else: return ustr(text) diff --git a/roundup/cgi/TranslationService.py b/roundup/cgi/TranslationService.py index 1d817c1..712d346 100644 --- a/roundup/cgi/TranslationService.py +++ b/roundup/cgi/TranslationService.py @@ -16,13 +16,12 @@ from roundup import i18n from roundup.cgi.PageTemplates import Expressions, PathIterator, TALES from roundup.cgi.TAL import TALInterpreter +from roundup.anypy.strings import us2u, u2s ### Translation classes class TranslationServiceMixin: - OUTPUT_ENCODING = "utf-8" - def translate(self, domain, msgid, mapping=None, context=None, target_language=None, default=None ): @@ -32,18 +31,15 @@ class TranslationServiceMixin: return _msg def gettext(self, msgid): - if not isinstance(msgid, unicode): - msgid = unicode(msgid, 'utf8') + msgid = us2u(msgid) msgtrans=self.ugettext(msgid) - return msgtrans.encode(self.OUTPUT_ENCODING) + return u2s(msgtrans) def ngettext(self, singular, plural, number): - if not isinstance(singular, unicode): - singular = unicode(singular, 'utf8') - if not isinstance(plural, unicode): - plural = unicode(plural, 'utf8') + singular = us2u(singular) + plural = us2u(plural) msgtrans=self.ungettext(singular, plural, number) - return msgtrans.encode(self.OUTPUT_ENCODING) + return u2s(msgtrans) class TranslationService(TranslationServiceMixin, i18n.RoundupTranslations): pass @@ -55,8 +51,7 @@ class NullTranslationService(TranslationServiceMixin, return self._fallback.ugettext(message) # Sometimes the untranslatable message is a UTF-8 encoded string # (thanks to PageTemplate's internals). - if not isinstance(message, unicode): - return unicode(message, 'utf8') + message = us2u(message) return message ### TAL patching diff --git a/roundup/cgi/engine_chameleon.py b/roundup/cgi/engine_chameleon.py index 4e6782c..db6b09c 100644 --- a/roundup/cgi/engine_chameleon.py +++ b/roundup/cgi/engine_chameleon.py @@ -6,6 +6,7 @@ import os.path import chameleon from roundup.cgi.templating import StringIO, context, TALLoaderBase +from roundup.anypy.strings import s2u class Loader(TALLoaderBase): def __init__(self, dir): @@ -27,7 +28,7 @@ class RoundupPageTemplate(object): def translate(msgid, domain=None, mapping=None, default=None): result = client.translator.translate(domain, msgid, mapping=mapping, default=default) - return unicode(result, client.translator.OUTPUT_ENCODING) + return s2u(result) output = self._pt.render(None, translate, **c) return output.encode(client.charset) diff --git a/roundup/cgi/engine_jinja2.py b/roundup/cgi/engine_jinja2.py index 3d2c3bc..aa36b82 100644 --- a/roundup/cgi/engine_jinja2.py +++ b/roundup/cgi/engine_jinja2.py @@ -40,6 +40,7 @@ from types import MethodType # http://jinja.pocoo.org/docs/api/#loaders from roundup.cgi.templating import context, LoaderBase, TemplateBase +from roundup.anypy.strings import s2u class Jinja2Loader(LoaderBase): def __init__(self, dir): @@ -59,8 +60,7 @@ class Jinja2Loader(LoaderBase): # The automatic conversion will assume 'ascii' and fail sometime. # Analysed with roundup 1.5.0 and jinja 2.7.1. See issue2550811. self._env.filters["u"] = lambda s: \ - unicode(s(), "utf-8") if type(s) == MethodType \ - else unicode(s, "utf-8") + s2u(s()) if type(s) == MethodType else s2u(s) def check(self, tplname): #print tplname diff --git a/roundup/cgi/templating.py b/roundup/cgi/templating.py index 4d444ec..79ebd4b 100644 --- a/roundup/cgi/templating.py +++ b/roundup/cgi/templating.py @@ -29,6 +29,7 @@ from roundup.anypy import urllib_ from roundup import hyperdb, date, support from roundup import i18n from roundup.i18n import _ +from roundup.anypy.strings import is_us, us2s, s2u, u2s from .KeywordsExpr import render_keywords_expression_editor @@ -1772,7 +1773,7 @@ class BooleanHTMLProperty(HTMLProperty): return self.plain(escape=1) value = self._value - if isinstance(value, str) or isinstance(value, unicode): + if is_us(value): value = value.strip().lower() in ('checked', 'yes', 'true', 'on', '1') @@ -1825,8 +1826,7 @@ class DateHTMLProperty(HTMLProperty): anonymous=0, offset=None): HTMLProperty.__init__(self, client, classname, nodeid, prop, name, value, anonymous=anonymous) - if self._value and not (isinstance(self._value, str) or - isinstance(self._value, unicode)): + if self._value and not is_us(self._value): self._value.setTranslator(self._client.translator) self._offset = offset if self._offset is None : @@ -1908,9 +1908,9 @@ class DateHTMLProperty(HTMLProperty): raise ValueError(self._('default value for ' 'DateHTMLProperty must be either DateHTMLProperty ' 'or string date representation.')) - elif isinstance(value, str) or isinstance(value, unicode): + elif is_us(value): # most likely erroneous input to be passed back to user - if isinstance(value, unicode): value = value.encode('utf8') + value = us2s(value) s = self.input(name=self._formname, value=value, size=size, **kwargs) if popcal: @@ -1921,7 +1921,7 @@ class DateHTMLProperty(HTMLProperty): if raw_value is None: value = '' - elif isinstance(raw_value, str) or isinstance(raw_value, unicode): + elif is_us(raw_value): if format is self._marker: value = raw_value else: @@ -2010,7 +2010,7 @@ class IntervalHTMLProperty(HTMLProperty): anonymous=0): HTMLProperty.__init__(self, client, classname, nodeid, prop, name, value, anonymous) - if self._value and not isinstance(self._value, (str, unicode)): + if self._value and not is_us(self._value): self._value.setTranslator(self._client.translator) def plain(self, escape=0): @@ -2965,9 +2965,9 @@ function help_window(helpurl, width, height) { klass = self.client.db.getclass(self.classname) if self.search_text: matches = self.client.db.indexer.search( - [w.upper().encode("utf-8", "replace") for w in re.findall( + [u2s(w.upper()) for w in re.findall( r'(?u)\b\w{2,25}\b', - unicode(self.search_text, "utf-8", "replace") + s2u(self.search_text, "replace") )], klass) else: matches = None diff --git a/roundup/configuration.py b/roundup/configuration.py index 64434ad..3596898 100644 --- a/roundup/configuration.py +++ b/roundup/configuration.py @@ -540,8 +540,9 @@ class RegExpOption(Option): return value.pattern def str2value(self, value): - if not isinstance(value, unicode): + if not isinstance(value, type(u'')): value = str(value) + if not isinstance(value, type(u'')): # if it is 7-bit ascii, use it as string, # otherwise convert to unicode. try: diff --git a/roundup/dehtml.py b/roundup/dehtml.py index dd959ca..6682300 100644 --- a/roundup/dehtml.py +++ b/roundup/dehtml.py @@ -1,5 +1,6 @@ from __future__ import print_function +from roundup.anypy.strings import u2s class dehtml: def __init__(self, converter): if converter == "none": @@ -17,7 +18,7 @@ class dehtml: for script in soup(["script", "style"]): script.extract() - return soup.get_text('\n', strip=True).encode('utf-8') + return u2s(soup.get_text('\n', strip=True)) self.html2text = html2text else: diff --git a/roundup/i18n.py b/roundup/i18n.py index e1c05f3..80e331c 100644 --- a/roundup/i18n.py +++ b/roundup/i18n.py @@ -40,6 +40,7 @@ import gettext as gettext_module import os from roundup import msgfmt +from roundup.anypy.strings import is_us # List of directories for mo file search (see SF bug 1219689) LOCALE_DIRS = [ @@ -79,7 +80,7 @@ def find_locales(language=None): if val: languages = val.split(':') break - elif isinstance(language, str) or isinstance(language, unicode): + elif is_us(language): languages = [language] else: # 'language' must be iterable diff --git a/roundup/mailer.py b/roundup/mailer.py index cac8d55..3e8953a 100644 --- a/roundup/mailer.py +++ b/roundup/mailer.py @@ -17,6 +17,7 @@ from email.mime.text import MIMEText from email.mime.multipart import MIMEMultipart from roundup.anypy import email_ +from roundup.anypy.strings import s2u try: import pyme, pyme.core @@ -85,12 +86,12 @@ class Mailer: ''' # encode header values if they need to be charset = getattr(self.config, 'EMAIL_CHARSET', 'utf-8') - tracker_name = unicode(self.config.TRACKER_NAME, 'utf-8') + tracker_name = s2u(self.config.TRACKER_NAME) if not author: author = (tracker_name, self.config.ADMIN_EMAIL) name = author[0] else: - name = unicode(author[0], 'utf-8') + name = s2u(author[0]) author = nice_sender_header(name, author[1], charset) try: message['Subject'] = subject.encode('ascii') diff --git a/roundup/mailgw.py b/roundup/mailgw.py index 0d891e9..0432b58 100644 --- a/roundup/mailgw.py +++ b/roundup/mailgw.py @@ -97,6 +97,7 @@ __docformat__ = 'restructuredtext' import string, re, os, mimetools, cStringIO, smtplib, socket, binascii, quopri import time, random, sys, logging +import codecs import traceback import email.utils @@ -343,7 +344,7 @@ class Message(mimetools.Message): charset = charset.lower().replace("windows-", 'cp') # Do conversion only if charset specified - handle # badly-specified charsets - edata = unicode(data, charset, 'replace').encode('utf-8') + edata = codecs.decode(data, charset, 'replace').encode('utf-8') # Convert from dos eol to unix edata = edata.replace('\r\n', '\n') else: diff --git a/roundup/password.py b/roundup/password.py index 45c446d..0d5de08 100644 --- a/roundup/password.py +++ b/roundup/password.py @@ -24,6 +24,8 @@ import os from base64 import b64encode, b64decode from hashlib import md5, sha1 +from roundup.anypy.strings import us2s, s2b + try: import crypt except ImportError: @@ -105,10 +107,8 @@ def pbkdf2(password, salt, rounds, keylen): :returns: raw bytes of generated key """ - if isinstance(password, unicode): - password = password.encode("utf-8") - if isinstance(salt, unicode): - salt = salt.encode("utf-8") + password = s2b(us2s(password)) + salt = s2b(us2s(salt)) if keylen > 40: #NOTE: pbkdf2 allows up to (2**31-1)*20 bytes, # but m2crypto has issues on some platforms above 40, @@ -126,8 +126,7 @@ def pbkdf2_unpack(pbkdf2): """ unpack pbkdf2 encrypted password into parts, assume it has format "{rounds}${salt}${digest} """ - if isinstance(pbkdf2, unicode): - pbkdf2 = pbkdf2.encode("ascii") + pbkdf2 = us2s(pbkdf2) try: rounds, salt, digest = pbkdf2.split("$") except ValueError: diff --git a/roundup/roundupdb.py b/roundup/roundupdb.py index ac398eb..70f44ec 100644 --- a/roundup/roundupdb.py +++ b/roundup/roundupdb.py @@ -39,6 +39,8 @@ from roundup.hyperdb import iter_roles from roundup.mailer import Mailer, MessageSendError, encode_quopri, \ nice_sender_header +from roundup.anypy.strings import s2u + try: import pyme, pyme.core # gpgme_check_version() must have been called once in a programm @@ -494,7 +496,7 @@ class IssueClass: charset = getattr(self.db.config, 'EMAIL_CHARSET', 'utf-8') # construct the content and convert to unicode object - body = unicode('\n'.join(m), 'utf-8').encode(charset) + body = s2u('\n'.join(m)).encode(charset) # make sure the To line is always the same (for testing mostly) sendto.sort() @@ -520,7 +522,7 @@ class IssueClass: sendto = [sendto] # tracker sender info - tracker_name = unicode(self.db.config.TRACKER_NAME, 'utf-8') + tracker_name = s2u(self.db.config.TRACKER_NAME) tracker_name = nice_sender_header(tracker_name, from_address, charset) diff --git a/roundup/xmlrpc.py b/roundup/xmlrpc.py index 6a89837..cade01c 100644 --- a/roundup/xmlrpc.py +++ b/roundup/xmlrpc.py @@ -12,6 +12,7 @@ from roundup import actions from roundup.anypy import xmlrpc_ SimpleXMLRPCDispatcher = xmlrpc_.server.SimpleXMLRPCDispatcher Binary = xmlrpc_.client.Binary +from roundup.anypy.strings import us2s from traceback import format_exc def translate(value): @@ -41,13 +42,8 @@ def props_from_args(db, cl, args, itemid=None): key, value = arg.split('=', 1) except ValueError : raise UsageError('argument "%s" not propname=value'%arg) - if isinstance(key, unicode): - try: - key = key.encode ('ascii') - except UnicodeEncodeError: - raise UsageError('argument %r is no valid ascii keyword'%key) - if isinstance(value, unicode): - value = value.encode('utf-8') + key = us2s(key) + value = us2s(value) if value: try: props[key] = hyperdb.rawToHyperdb(db, cl, itemid, diff --git a/scripts/import_sf.py b/scripts/import_sf.py index 85d7d09..bb93f88 100644 --- a/scripts/import_sf.py +++ b/scripts/import_sf.py @@ -30,6 +30,7 @@ except ImportError: from roundup import instance, hyperdb, date, support, password from roundup.anypy import http_, urllib_ +from roundup.anypy.strings import s2b, us2s today = date.Date('.') @@ -295,7 +296,7 @@ def import_xml(tracker_home, xml_file, file_dir): files.append(fid) name = name.strip() try: - f = open(os.path.join(file_dir, fid)) + f = open(os.path.join(file_dir, fid), 'rb') content = f.read() f.close() except: @@ -384,11 +385,11 @@ def write_csv(klass, data): if isinstance(klass, hyperdb.FileClass) and entry.get('content'): fname = klass.exportFilename('/tmp/imported/', entry['id']) support.ensureParentsExist(fname) - c = open(fname, 'w') - if isinstance(entry['content'], unicode): - c.write(entry['content'].encode('utf8')) - else: + c = open(fname, 'wb') + if isinstance(entry['content'], bytes): c.write(entry['content']) + else: + c.write(s2b(us2s(entry['content']))) c.close() f.close() -- 2.7.4