From dbaccf70ace2bf6eb9fcac00c864bbeaec72cd13 Mon Sep 17 00:00:00 2001
From: Joseph Myers <jsm@polyomino.org.uk>
Date: Mon, 18 Jun 2018 00:50:47 +0000
Subject: [PATCH 42/54] Python 3 preparation: unicode.

---
 roundup/anypy/strings.py           | 75 ++++++++++++++++++++++++++++++++++++++
 roundup/backends/back_sqlite.py    |  3 +-
 roundup/backends/indexer_rdbms.py  |  6 +--
 roundup/backends/indexer_whoosh.py |  4 +-
 roundup/backends/rdbms_common.py   |  4 +-
 roundup/cgi/PageTemplates/TALES.py |  2 +-
 roundup/cgi/TranslationService.py  | 19 ++++------
 roundup/cgi/engine_chameleon.py    |  3 +-
 roundup/cgi/engine_jinja2.py       |  4 +-
 roundup/cgi/templating.py          | 18 ++++-----
 roundup/configuration.py           |  3 +-
 roundup/dehtml.py                  |  3 +-
 roundup/i18n.py                    |  3 +-
 roundup/mailer.py                  |  5 ++-
 roundup/mailgw.py                  |  3 +-
 roundup/password.py                | 11 +++---
 roundup/roundupdb.py               |  6 ++-
 roundup/xmlrpc.py                  | 10 ++---
 scripts/import_sf.py               | 11 +++---
 19 files changed, 134 insertions(+), 59 deletions(-)
 create mode 100644 roundup/anypy/strings.py

diff --git a/roundup/anypy/strings.py b/roundup/anypy/strings.py
new file mode 100644
index 0000000..2429b3b
--- /dev/null
+++ b/roundup/anypy/strings.py
@@ -0,0 +1,75 @@
+# Roundup represents text internally using the native Python str type.
+# In Python 3, such strings are Unicode.  In Python 2, they are byte
+# strings encoded using UTF-8, and the Python 2 unicode type is used in
+# only a few places, generally for interacting with external modules
+# that require that type.
+
+import sys
+_py3 = sys.version_info[0] > 2
+
+def b2s(b):
+    """Decode UTF-8 bytes to the internal str (a no-op on Python 2)."""
+    if not _py3:
+        # The Python 2 internal format is already a UTF-8 byte string.
+        return b
+    return b.decode('utf-8')
+
+def s2b(s):
+    """Encode an internal str as UTF-8 bytes (a no-op on Python 2)."""
+    if not _py3:
+        # The Python 2 internal format is already UTF-8 encoded bytes.
+        return s
+    return s.encode('utf-8')
+
+def s2u(s, errors='strict'):
+    """Return the internal str s as Unicode; errors is the decode policy."""
+    if not _py3:
+        # Only Python 2 needs an actual decode from UTF-8.
+        return unicode(s, 'utf-8', errors)
+    return s
+
+def u2s(u):
+    """Return the Unicode string u in the internal str format."""
+    if not _py3:
+        # Python 2 keeps text internally as UTF-8 encoded str.
+        return u.encode('utf-8')
+    return u
+
+def us2u(s, errors='strict'):
+    """Return s, an internal str or a Unicode string, as a Unicode string.
+
+    errors is the decode error policy; it only matters for a Python 2 str.
+    """
+    if _py3 or isinstance(s, unicode):
+        # Already Unicode (the short-circuit keeps Python 3 off 'unicode').
+        return s
+    return unicode(s, 'utf-8', errors)
+
+def us2s(u):
+    """Return u, an internal str or a Unicode string, as an internal str.
+
+    Only a Python 2 unicode object needs converting (it is UTF-8 encoded).
+    """
+    if _py3 or not isinstance(u, unicode):
+        # Already in the internal format.
+        return u
+    return u.encode('utf-8')
+
+def uany2s(u):
+    """Return a Unicode string or any other object as an internal str.
+
+    A Python 2 unicode object is encoded as UTF-8; every other object,
+    including any Python 3 string, is handed to str().
+    """
+    if not _py3 and isinstance(u, unicode):
+        # Encode explicitly as UTF-8 instead of relying on str().
+        return u.encode('utf-8')
+    # Everything else, on either Python version, goes through str().
+    return str(u)
+
+def is_us(s):
+    """Tell whether s is an internal str or a Unicode string."""
+    if isinstance(s, str):
+        return True
+    # Only Python 2 has a separate unicode type left to check.
+    return not _py3 and isinstance(s, unicode)
diff --git a/roundup/backends/back_sqlite.py b/roundup/backends/back_sqlite.py
index 002f0ba..214286b 100644
--- a/roundup/backends/back_sqlite.py
+++ b/roundup/backends/back_sqlite.py
@@ -13,6 +13,7 @@ import os, base64, marshal, shutil, time, logging
 from roundup import hyperdb, date, password
 from roundup.backends import rdbms_common
 from roundup.backends.sessions_dbm import Sessions, OneTimeKeys
+from roundup.anypy.strings import uany2s
 
 sqlite_version = None
 try:
@@ -85,7 +86,7 @@ class Database(rdbms_common.Database):
         hyperdb.Multilink : lambda x: x,    # used in journal marshalling
     }
     sql_to_hyperdb_value = {
-        hyperdb.String : lambda x: isinstance(x, unicode) and x.encode('utf8') or str(x),
+        hyperdb.String : uany2s,
         hyperdb.Date   : lambda x: date.Date(str(x)),
         hyperdb.Link   : str, # XXX numeric ids
         hyperdb.Interval  : date.Interval,
diff --git a/roundup/backends/indexer_rdbms.py b/roundup/backends/indexer_rdbms.py
index 6b99035..a6c7392 100644
--- a/roundup/backends/indexer_rdbms.py
+++ b/roundup/backends/indexer_rdbms.py
@@ -5,6 +5,7 @@ propname, itemid) instances.
 import re
 
 from roundup.backends.indexer_common import Indexer as IndexerBase
+from roundup.anypy.strings import us2u, u2s
 
 class Indexer(IndexerBase):
     def __init__(self, db):
@@ -61,10 +62,9 @@ class Indexer(IndexerBase):
             self.db.cursor.execute(sql, (id, ))
 
         # ok, find all the unique words in the text
-        if not isinstance(text, unicode):
-            text = unicode(text, "utf-8", "replace")
+        text = us2u(text, "replace")
         text = text.upper()
-        wordlist = [w.encode("utf-8")
+        wordlist = [u2s(w)
                     for w in re.findall(r'(?u)\b\w{%d,%d}\b'
                                         % (self.minlength, self.maxlength), text)]
         words = set()
diff --git a/roundup/backends/indexer_whoosh.py b/roundup/backends/indexer_whoosh.py
index 070777e..18f5ba1 100644
--- a/roundup/backends/indexer_whoosh.py
+++ b/roundup/backends/indexer_whoosh.py
@@ -5,6 +5,7 @@ import re, os
 from whoosh import fields, qparser, index, query, analysis
 
 from roundup.backends.indexer_common import Indexer as IndexerBase
+from roundup.anypy.strings import us2u
 
 class Indexer(IndexerBase):
     def __init__(self, db):
@@ -78,8 +79,7 @@ class Indexer(IndexerBase):
         if not text:
             text = u''
 
-        if not isinstance(text, unicode):
-            text = unicode(text, "utf-8", "replace")
+        text = us2u(text, "replace")
 
         # We use the identifier twice: once in the actual "text" being
         # indexed so we can search on it, and again as the "data" being
diff --git a/roundup/backends/rdbms_common.py b/roundup/backends/rdbms_common.py
index 2b98b0b..2552e34 100644
--- a/roundup/backends/rdbms_common.py
+++ b/roundup/backends/rdbms_common.py
@@ -69,6 +69,7 @@ from roundup.backends.sessions_rdbms import Sessions, OneTimeKeys
 from roundup.date import Range
 
 from roundup.backends.back_anydbm import compile_expression
+from roundup.anypy.strings import us2s
 
 
 # dummy value meaning "argument not passed"
@@ -2944,8 +2945,7 @@ class Class(hyperdb.Class):
             elif isinstance(prop, hyperdb.Password):
                 value = password.Password(encrypted=value)
             elif isinstance(prop, String):
-                if isinstance(value, unicode):
-                    value = value.encode('utf8')
+                value = us2s(value)
                 if not isinstance(value, str):
                     raise TypeError('new property "%(propname)s" not a '
                         'string: %(value)r'%locals())
diff --git a/roundup/cgi/PageTemplates/TALES.py b/roundup/cgi/PageTemplates/TALES.py
index 3462681..a94870e 100644
--- a/roundup/cgi/PageTemplates/TALES.py
+++ b/roundup/cgi/PageTemplates/TALES.py
@@ -231,7 +231,7 @@ class Context:
         text = self.evaluate(expr)
         if text is Default or text is None:
             return text
-        if isinstance(text, unicode):
+        if isinstance(text, type(u'')):
             return text
         else:
             return ustr(text)
diff --git a/roundup/cgi/TranslationService.py b/roundup/cgi/TranslationService.py
index 1d817c1..712d346 100644
--- a/roundup/cgi/TranslationService.py
+++ b/roundup/cgi/TranslationService.py
@@ -16,13 +16,12 @@
 from roundup import i18n
 from roundup.cgi.PageTemplates import Expressions, PathIterator, TALES
 from roundup.cgi.TAL import TALInterpreter
+from roundup.anypy.strings import us2u, u2s
 
 ### Translation classes
 
 class TranslationServiceMixin:
 
-    OUTPUT_ENCODING = "utf-8"
-
     def translate(self, domain, msgid, mapping=None,
         context=None, target_language=None, default=None
     ):
@@ -32,18 +31,15 @@ class TranslationServiceMixin:
         return _msg
 
     def gettext(self, msgid):
-        if not isinstance(msgid, unicode):
-            msgid = unicode(msgid, 'utf8')
+        msgid = us2u(msgid)
         msgtrans=self.ugettext(msgid)
-        return msgtrans.encode(self.OUTPUT_ENCODING)
+        return u2s(msgtrans)
 
     def ngettext(self, singular, plural, number):
-        if not isinstance(singular, unicode):
-            singular = unicode(singular, 'utf8')
-        if not isinstance(plural, unicode):
-            plural = unicode(plural, 'utf8')
+        singular = us2u(singular)
+        plural = us2u(plural)
         msgtrans=self.ungettext(singular, plural, number)
-        return msgtrans.encode(self.OUTPUT_ENCODING)
+        return u2s(msgtrans)
 
 class TranslationService(TranslationServiceMixin, i18n.RoundupTranslations):
     pass
@@ -55,8 +51,7 @@ class NullTranslationService(TranslationServiceMixin,
             return self._fallback.ugettext(message)
         # Sometimes the untranslatable message is a UTF-8 encoded string
         # (thanks to PageTemplate's internals).
-        if not isinstance(message, unicode):
-            return unicode(message, 'utf8')
+        message = us2u(message)
         return message
 
 ### TAL patching
diff --git a/roundup/cgi/engine_chameleon.py b/roundup/cgi/engine_chameleon.py
index 4e6782c..db6b09c 100644
--- a/roundup/cgi/engine_chameleon.py
+++ b/roundup/cgi/engine_chameleon.py
@@ -6,6 +6,7 @@ import os.path
 import chameleon
 
 from roundup.cgi.templating import StringIO, context, TALLoaderBase
+from roundup.anypy.strings import s2u
 
 class Loader(TALLoaderBase):
     def __init__(self, dir):
@@ -27,7 +28,7 @@ class RoundupPageTemplate(object):
         def translate(msgid, domain=None, mapping=None, default=None):
             result = client.translator.translate(domain, msgid,
                          mapping=mapping, default=default)
-            return unicode(result, client.translator.OUTPUT_ENCODING)
+            return s2u(result)
 
         output = self._pt.render(None, translate, **c)
         return output.encode(client.charset)
diff --git a/roundup/cgi/engine_jinja2.py b/roundup/cgi/engine_jinja2.py
index 3d2c3bc..aa36b82 100644
--- a/roundup/cgi/engine_jinja2.py
+++ b/roundup/cgi/engine_jinja2.py
@@ -40,6 +40,7 @@ from types import MethodType
 # http://jinja.pocoo.org/docs/api/#loaders
 
 from roundup.cgi.templating import context, LoaderBase, TemplateBase
+from roundup.anypy.strings import s2u
 
 class Jinja2Loader(LoaderBase):
     def __init__(self, dir):
@@ -59,8 +60,7 @@ class Jinja2Loader(LoaderBase):
         # The automatic conversion will assume 'ascii' and fail sometime.
         # Analysed with roundup 1.5.0 and jinja 2.7.1. See issue2550811.
         self._env.filters["u"] = lambda s: \
-            unicode(s(), "utf-8") if type(s) == MethodType \
-                                  else unicode(s, "utf-8")
+            s2u(s()) if type(s) == MethodType else s2u(s)
 
     def check(self, tplname):
         #print tplname
diff --git a/roundup/cgi/templating.py b/roundup/cgi/templating.py
index 4d444ec..79ebd4b 100644
--- a/roundup/cgi/templating.py
+++ b/roundup/cgi/templating.py
@@ -29,6 +29,7 @@ from roundup.anypy import urllib_
 from roundup import hyperdb, date, support
 from roundup import i18n
 from roundup.i18n import _
+from roundup.anypy.strings import is_us, us2s, s2u, u2s
 
 from .KeywordsExpr import render_keywords_expression_editor
 
@@ -1772,7 +1773,7 @@ class BooleanHTMLProperty(HTMLProperty):
             return self.plain(escape=1)
 
         value = self._value
-        if isinstance(value, str) or isinstance(value, unicode):
+        if is_us(value):
             value = value.strip().lower() in ('checked', 'yes', 'true',
                 'on', '1')
 
@@ -1825,8 +1826,7 @@ class DateHTMLProperty(HTMLProperty):
             anonymous=0, offset=None):
         HTMLProperty.__init__(self, client, classname, nodeid, prop, name,
                 value, anonymous=anonymous)
-        if self._value and not (isinstance(self._value, str) or
-                isinstance(self._value, unicode)):
+        if self._value and not is_us(self._value):
             self._value.setTranslator(self._client.translator)
         self._offset = offset
         if self._offset is None :
@@ -1908,9 +1908,9 @@ class DateHTMLProperty(HTMLProperty):
                     raise ValueError(self._('default value for '
                         'DateHTMLProperty must be either DateHTMLProperty '
                         'or string date representation.'))
-        elif isinstance(value, str) or isinstance(value, unicode):
+        elif is_us(value):
             # most likely erroneous input to be passed back to user
-            if isinstance(value, unicode): value = value.encode('utf8')
+            value = us2s(value)
             s = self.input(name=self._formname, value=value, size=size,
                               **kwargs)
             if popcal:
@@ -1921,7 +1921,7 @@ class DateHTMLProperty(HTMLProperty):
 
         if raw_value is None:
             value = ''
-        elif isinstance(raw_value, str) or isinstance(raw_value, unicode):
+        elif is_us(raw_value):
             if format is self._marker:
                 value = raw_value
             else:
@@ -2010,7 +2010,7 @@ class IntervalHTMLProperty(HTMLProperty):
             anonymous=0):
         HTMLProperty.__init__(self, client, classname, nodeid, prop,
             name, value, anonymous)
-        if self._value and not isinstance(self._value, (str, unicode)):
+        if self._value and not is_us(self._value):
             self._value.setTranslator(self._client.translator)
 
     def plain(self, escape=0):
@@ -2965,9 +2965,9 @@ function help_window(helpurl, width, height) {
         klass = self.client.db.getclass(self.classname)
         if self.search_text:
             matches = self.client.db.indexer.search(
-                [w.upper().encode("utf-8", "replace") for w in re.findall(
+                [u2s(w.upper()) for w in re.findall(
                     r'(?u)\b\w{2,25}\b',
-                    unicode(self.search_text, "utf-8", "replace")
+                    s2u(self.search_text, "replace")
                 )], klass)
         else:
             matches = None
diff --git a/roundup/configuration.py b/roundup/configuration.py
index 64434ad..3596898 100644
--- a/roundup/configuration.py
+++ b/roundup/configuration.py
@@ -540,8 +540,9 @@ class RegExpOption(Option):
         return value.pattern
 
     def str2value(self, value):
-        if not isinstance(value, unicode):
+        if not isinstance(value, type(u'')):
             value = str(value)
+        if not isinstance(value, type(u'')):
             # if it is 7-bit ascii, use it as string,
             # otherwise convert to unicode.
             try:
diff --git a/roundup/dehtml.py b/roundup/dehtml.py
index dd959ca..6682300 100644
--- a/roundup/dehtml.py
+++ b/roundup/dehtml.py
@@ -1,5 +1,6 @@
 
 from __future__ import print_function
+from roundup.anypy.strings import u2s
 class dehtml:
     def __init__(self, converter):
         if converter == "none":
@@ -17,7 +18,7 @@ class dehtml:
                     for script in soup(["script", "style"]):
                         script.extract()
 
-                    return soup.get_text('\n', strip=True).encode('utf-8')
+                    return u2s(soup.get_text('\n', strip=True))
 
                 self.html2text = html2text
             else:
diff --git a/roundup/i18n.py b/roundup/i18n.py
index e1c05f3..80e331c 100644
--- a/roundup/i18n.py
+++ b/roundup/i18n.py
@@ -40,6 +40,7 @@ import gettext as gettext_module
 import os
 
 from roundup import msgfmt
+from roundup.anypy.strings import is_us
 
 # List of directories for mo file search (see SF bug 1219689)
 LOCALE_DIRS = [
@@ -79,7 +80,7 @@ def find_locales(language=None):
             if val:
                 languages = val.split(':')
                 break
-    elif isinstance(language, str) or  isinstance(language, unicode):
+    elif is_us(language):
         languages = [language]
     else:
         # 'language' must be iterable
diff --git a/roundup/mailer.py b/roundup/mailer.py
index cac8d55..3e8953a 100644
--- a/roundup/mailer.py
+++ b/roundup/mailer.py
@@ -17,6 +17,7 @@ from email.mime.text import MIMEText
 from email.mime.multipart import MIMEMultipart
 
 from roundup.anypy import email_
+from roundup.anypy.strings import s2u
 
 try:
     import pyme, pyme.core
@@ -85,12 +86,12 @@ class Mailer:
         '''
         # encode header values if they need to be
         charset = getattr(self.config, 'EMAIL_CHARSET', 'utf-8')
-        tracker_name = unicode(self.config.TRACKER_NAME, 'utf-8')
+        tracker_name = s2u(self.config.TRACKER_NAME)
         if not author:
             author = (tracker_name, self.config.ADMIN_EMAIL)
             name = author[0]
         else:
-            name = unicode(author[0], 'utf-8')
+            name = s2u(author[0])
         author = nice_sender_header(name, author[1], charset)
         try:
             message['Subject'] = subject.encode('ascii')
diff --git a/roundup/mailgw.py b/roundup/mailgw.py
index 0d891e9..0432b58 100644
--- a/roundup/mailgw.py
+++ b/roundup/mailgw.py
@@ -97,6 +97,7 @@ __docformat__ = 'restructuredtext'
 
 import string, re, os, mimetools, cStringIO, smtplib, socket, binascii, quopri
 import time, random, sys, logging
+import codecs
 import traceback
 import email.utils
 
@@ -343,7 +344,7 @@ class Message(mimetools.Message):
             charset = charset.lower().replace("windows-", 'cp')
             # Do conversion only if charset specified - handle
             # badly-specified charsets
-            edata = unicode(data, charset, 'replace').encode('utf-8')
+            edata = codecs.decode(data, charset, 'replace').encode('utf-8')
             # Convert from dos eol to unix
             edata = edata.replace('\r\n', '\n')
         else:
diff --git a/roundup/password.py b/roundup/password.py
index 45c446d..0d5de08 100644
--- a/roundup/password.py
+++ b/roundup/password.py
@@ -24,6 +24,8 @@ import os
 from base64 import b64encode, b64decode
 from hashlib import md5, sha1
 
+from roundup.anypy.strings import us2s, s2b
+
 try:
     import crypt
 except ImportError:
@@ -105,10 +107,8 @@ def pbkdf2(password, salt, rounds, keylen):
     :returns:
         raw bytes of generated key
     """
-    if isinstance(password, unicode):
-        password = password.encode("utf-8")
-    if isinstance(salt, unicode):
-        salt = salt.encode("utf-8")
+    password = s2b(us2s(password))
+    salt = s2b(us2s(salt))
     if keylen > 40:
         #NOTE: pbkdf2 allows up to (2**31-1)*20 bytes,
         # but m2crypto has issues on some platforms above 40,
@@ -126,8 +126,7 @@ def pbkdf2_unpack(pbkdf2):
     """ unpack pbkdf2 encrypted password into parts,
         assume it has format "{rounds}${salt}${digest}
     """
-    if isinstance(pbkdf2, unicode):
-        pbkdf2 = pbkdf2.encode("ascii")
+    pbkdf2 = us2s(pbkdf2)
     try:
         rounds, salt, digest = pbkdf2.split("$")
     except ValueError:
diff --git a/roundup/roundupdb.py b/roundup/roundupdb.py
index ac398eb..70f44ec 100644
--- a/roundup/roundupdb.py
+++ b/roundup/roundupdb.py
@@ -39,6 +39,8 @@ from roundup.hyperdb import iter_roles
 from roundup.mailer import Mailer, MessageSendError, encode_quopri, \
     nice_sender_header
 
+from roundup.anypy.strings import s2u
+
 try:
     import pyme, pyme.core
     # gpgme_check_version() must have been called once in a programm
@@ -494,7 +496,7 @@ class IssueClass:
         charset = getattr(self.db.config, 'EMAIL_CHARSET', 'utf-8')
 
         # construct the content and convert to unicode object
-        body = unicode('\n'.join(m), 'utf-8').encode(charset)
+        body = s2u('\n'.join(m)).encode(charset)
 
         # make sure the To line is always the same (for testing mostly)
         sendto.sort()
@@ -520,7 +522,7 @@ class IssueClass:
             sendto = [sendto]
 
         # tracker sender info
-        tracker_name = unicode(self.db.config.TRACKER_NAME, 'utf-8')
+        tracker_name = s2u(self.db.config.TRACKER_NAME)
         tracker_name = nice_sender_header(tracker_name, from_address,
             charset)
 
diff --git a/roundup/xmlrpc.py b/roundup/xmlrpc.py
index 6a89837..cade01c 100644
--- a/roundup/xmlrpc.py
+++ b/roundup/xmlrpc.py
@@ -12,6 +12,7 @@ from roundup import actions
 from roundup.anypy import xmlrpc_
 SimpleXMLRPCDispatcher = xmlrpc_.server.SimpleXMLRPCDispatcher
 Binary = xmlrpc_.client.Binary
+from roundup.anypy.strings import us2s
 from traceback import format_exc
 
 def translate(value):
@@ -41,13 +42,8 @@ def props_from_args(db, cl, args, itemid=None):
             key, value = arg.split('=', 1)
         except ValueError :
             raise UsageError('argument "%s" not propname=value'%arg)
-        if isinstance(key, unicode):
-            try:
-                key = key.encode ('ascii')
-            except UnicodeEncodeError:
-                raise UsageError('argument %r is no valid ascii keyword'%key)
-        if isinstance(value, unicode):
-            value = value.encode('utf-8')
+        key = us2s(key)
+        value = us2s(value)
         if value:
             try:
                 props[key] = hyperdb.rawToHyperdb(db, cl, itemid,
diff --git a/scripts/import_sf.py b/scripts/import_sf.py
index 85d7d09..bb93f88 100644
--- a/scripts/import_sf.py
+++ b/scripts/import_sf.py
@@ -30,6 +30,7 @@ except ImportError:
 
 from roundup import instance, hyperdb, date, support, password
 from roundup.anypy import http_, urllib_
+from roundup.anypy.strings import s2b, us2s
 
 today = date.Date('.')
 
@@ -295,7 +296,7 @@ def import_xml(tracker_home, xml_file, file_dir):
                     files.append(fid)
                     name = name.strip()
                     try:
-                        f = open(os.path.join(file_dir, fid))
+                        f = open(os.path.join(file_dir, fid), 'rb')
                         content = f.read()
                         f.close()
                     except:
@@ -384,11 +385,11 @@ def write_csv(klass, data):
         if isinstance(klass, hyperdb.FileClass) and entry.get('content'):
             fname = klass.exportFilename('/tmp/imported/', entry['id'])
             support.ensureParentsExist(fname)
-            c = open(fname, 'w')
-            if isinstance(entry['content'], unicode):
-                c.write(entry['content'].encode('utf8'))
-            else:
+            c = open(fname, 'wb')
+            if isinstance(entry['content'], bytes):
                 c.write(entry['content'])
+            else:
+                c.write(s2b(us2s(entry['content'])))
             c.close()
 
     f.close()
-- 
2.7.4