def get_indexer(config, db):
    '''Return a full-text indexer instance for "db".

    "config" may carry an INDEXER setting naming the backend to use:
    "xapian", "whoosh" or "dbm".  The name is matched without regard to
    case or surrounding whitespace.  When the setting is missing or
    blank, xapian and then whoosh are tried and the first one that can
    be imported is used; indexer_dbm is the fallback.

    Raises ImportError when an explicitly requested backend cannot be
    imported and AssertionError when the name is not recognised.
    '''
    # NOTE(review): rdbms_common used to fall back to indexer_rdbms when
    # xapian was missing; this function falls back to indexer_dbm for
    # every backend -- confirm that this is intended.

    # The imports stay inside the loaders so that a backend is only
    # imported when it is actually selected.
    def use_xapian():
        from indexer_xapian import Indexer
        return Indexer(db)

    def use_whoosh():
        from indexer_whoosh import Indexer
        return Indexer(db)

    def use_dbm():
        from indexer_dbm import Indexer
        return Indexer(db)

    backends = {
        "xapian": use_xapian,
        "whoosh": use_whoosh,
        "dbm": use_dbm,
    }

    configured = getattr(config, "INDEXER", "")
    wanted = (configured or "").strip().lower()

    if not wanted:
        # Nothing requested: take the first optional backend that is
        # installed, then fall back to the bundled dbm indexer.
        for loader in (use_xapian, use_whoosh):
            try:
                return loader()
            except ImportError:
                pass
        return use_dbm()

    if wanted in backends:
        return backends[wanted]()

    # Report the value exactly as it was configured.
    raise AssertionError("Invalid indexer: %r" %(configured))
class Indexer(IndexerBase):
    '''Full-text indexer backed by a Whoosh index.

    The index is kept in the "whoosh-index" directory below
    db.config.DATABASE.  Changes are collected in a single Whoosh writer
    until save_index() commits them or rollback() cancels them.
    '''
    def __init__(self, db):
        IndexerBase.__init__(self, db)
        self.db_path = db.config.DATABASE
        self.reindex = 0
        # The writer and the index are opened lazily on first use.
        self.writer = None
        self.index = None
        # Identifiers already checked for deletion since the last
        # commit or rollback.
        self.deleted = set()

    def _get_index(self):
        '''Return the Whoosh index, creating it on first use.'''
        if self.index is None:
            path = os.path.join(self.db_path, 'whoosh-index')
            if not os.path.exists(path):
                os.mkdir(path)
            # Test for the index itself, not just its directory, so a
            # directory left empty by an interrupted setup gets an index
            # instead of making open_dir() fail.
            if not index.exists_in(path):
                schema = fields.Schema(
                    identifier=fields.ID(stored=True, unique=True),
                    content=fields.TEXT)
                index.create_in(path, schema)
            self.index = index.open_dir(path)
        return self.index

    def save_index(self):
        '''Save the changes to the index.'''
        if not self.writer:
            return
        self.writer.commit()
        self.deleted = set()
        self.writer = None

    def close(self):
        '''close the indexing database'''
        pass

    def rollback(self):
        '''Discard the changes made since the last save_index().'''
        if not self.writer:
            return
        self.writer.cancel()
        self.deleted = set()
        self.writer = None

    def force_reindex(self):
        '''Flag the index as needing a rebuild.

        Only the flag reported by should_reindex() is set here; the
        index itself is not touched.
        '''
        self.reindex = 1

    def should_reindex(self):
        '''returns True if the indexes need to be rebuilt'''
        return self.reindex

    def _get_writer(self):
        '''Return the writer of the open transaction, opening one if
        needed.'''
        if self.writer is None:
            self.writer = self._get_index().writer()
        return self.writer

    def _get_searcher(self):
        '''Return a new searcher; the caller has to close it.'''
        return self._get_index().searcher()

    def add_text(self, identifier, text, mime_type='text/plain'):
        ''' "identifier" is (classname, itemid, property)

        Only text/plain content is indexed; other mime types are
        ignored.  Byte strings are decoded as UTF-8 with undecodable
        bytes replaced.
        '''
        if mime_type != 'text/plain':
            return

        if not text:
            text = u''

        if not isinstance(text, unicode):
            text = unicode(text, "utf-8", "replace")

        # We use the identifier twice: once in the actual "text" being
        # indexed so we can search on it, and again as the "data" being
        # indexed so we know what we're matching when we get results
        identifier = u"%s:%s:%s"%identifier

        writer = self._get_writer()

        # Whoosh gets upset if a document is deleted twice in one
        # transaction, so we keep track of the documents we have so far
        # deleted to make sure that we only delete them once.
        if identifier not in self.deleted:
            searcher = self._get_searcher()
            try:
                results = searcher.search(
                    query.Term("identifier", identifier), limit=1)
                already_indexed = len(results) > 0
            finally:
                # Searchers hold open files; release them promptly.
                searcher.close()
            if already_indexed:
                writer.delete_by_term("identifier", identifier)
            self.deleted.add(identifier)

        # Note: use '.lower()' because it seems like Whoosh gets
        # better results that way.
        writer.add_document(identifier=identifier, content=text.lower())

    def find(self, wordlist):
        '''look up all the words in the wordlist.

        Stop words are dropped first.  If no words remain an empty
        dictionary is returned; otherwise the result is a list of
        (classname, itemid, property) tuples for the documents matching
        all of the words.
        '''
        wordlist = [ word for word in wordlist
                     if not self.is_stopword(word.upper()) ]

        if not wordlist:
            return {}

        q = query.And([ query.FuzzyTerm("content", word.lower())
                        for word in wordlist ])

        searcher = self._get_searcher()
        try:
            # limit=None: return every match rather than only the
            # default number of top hits.
            results = searcher.search(q, limit=None)
            # Read the stored fields while the searcher is still open.
            return [tuple(result["identifier"].split(':'))
                    for result in results]
        finally:
            searcher.close()
class WhooshIndexerTest(IndexerTest):
    '''Run the inherited IndexerTest cases against the Whoosh indexer.'''

    def setUp(self):
        '''Recreate an empty test-index directory and build the indexer.'''
        index_dir = 'test-index'
        # Remove whatever an earlier run may have left behind.
        if os.path.exists(index_dir):
            shutil.rmtree(index_dir)
        os.mkdir(index_dir)
        # Imported here rather than at module level, as in the original.
        from roundup.backends.indexer_whoosh import Indexer
        self.dex = Indexer(db)

    def tearDown(self):
        '''Remove the test-index directory again.'''
        shutil.rmtree('test-index')