--- old_indexer_xapian.py 2016-01-11 22:53:59.000000000 +0800 +++ new_indexer_xapian.py 2016-02-25 12:05:34.502876251 +0800 @@ -2,6 +2,8 @@ ''' import re, os +from mmseg.search import seg_txt_search,seg_txt_2_dict + import xapian from roundup.backends.indexer_common import Indexer as IndexerBase @@ -57,7 +59,7 @@ # open the database and start a transaction if needed database = self._get_database() - # XXX: Xapian now supports transactions, + # XXX: Xapian now supports transactions, # but there is a call to save_index() missing. #if not self.transaction_active: #database.begin_transaction() @@ -76,6 +78,19 @@ doc.set_data(identifier) doc.add_term(identifier, 0) + # get Chinese words + # Find all Chinese text in a string using Python and Regex - Stack Overflow - + # http://stackoverflow.com/questions/2718196/find-all-chinese-text-in-a-string-using-python-and-regex + utext = text.decode('utf-8') + for chinese_word in re.findall(ur'[\u4e00-\u9fff]+',utext): + #print chinese_word + utf8_chinese_word = chinese_word.encode('utf-8') + #print utf8_chinese_word + for word, value in seg_txt_2_dict(utf8_chinese_word).iteritems(): + #print word, value + doc.add_term(word, value) + + for match in re.finditer(r'\b\w{%d,%d}\b' % (self.minlength, self.maxlength), text.upper()):