Index: roundup/backends/indexer_xapian.py =================================================================== --- roundup/backends/indexer_xapian.py (revision 4351) +++ roundup/backends/indexer_xapian.py (working copy) @@ -88,7 +88,9 @@ doc.set_data(identifier) doc.add_posting(identifier, 0) - for match in re.finditer(r'\b\w{2,25}\b', text.upper()): + for match in re.finditer(r'\b\w{%d,%d}\b' + % (self.minlength, self.maxlength), + text.upper()): word = match.group(0) if self.is_stopword(word): continue Index: roundup/backends/indexer_common.py =================================================================== --- roundup/backends/indexer_common.py (revision 4351) +++ roundup/backends/indexer_common.py (working copy) @@ -22,6 +22,10 @@ self.stopwords = set(STOPWORDS) for word in db.config[('main', 'indexer_stopwords')]: self.stopwords.add(word) + # Do not index anything longer than 25 characters since that'll be + # gibberish (encoded text or somesuch) or shorter than 2 characters + self.minlength = 2 + self.maxlength = 25 def is_stopword(self, word): return word in self.stopwords Index: roundup/backends/indexer_dbm.py =================================================================== --- roundup/backends/indexer_dbm.py (revision 4351) +++ roundup/backends/indexer_dbm.py (working copy) @@ -135,14 +135,12 @@ # case insensitive text = str(text).upper() - # Split the raw text, losing anything longer than 25 characters - # since that'll be gibberish (encoded text or somesuch) or shorter - # than 3 characters since those short words appear all over the - # place - return re.findall(r'\b\w{2,25}\b', text) + # Split the raw text + return re.findall(r'\b\w{%d,%d}\b' % (self.minlength, self.maxlength), + text) - # we override this to ignore not 2 < word < 25 and also to fix a bug - - # the (fail) case. + # we override this to ignore too short and too long words + # and also to fix a bug - the (fail) case. def find(self, wordlist): '''Locate files that match ALL the words in wordlist ''' @@ -152,10 +150,12 @@ entries = {} hits = None for word in wordlist: - if not 2 < len(word) < 25: + if not self.minlength <= len(word) <= self.maxlength: # word outside the bounds of what we index - ignore continue word = word.upper() + if self.is_stopword(word): + continue entry = self.words.get(word) # For each word, get index entries[word] = entry # of matching files if not entry: # Nothing for this one word (fail) Index: roundup/backends/indexer_rdbms.py =================================================================== --- roundup/backends/indexer_rdbms.py (revision 4351) +++ roundup/backends/indexer_rdbms.py (working copy) @@ -66,11 +66,12 @@ # ok, find all the unique words in the text text = unicode(text, "utf-8", "replace").upper() wordlist = [w.encode("utf-8") - for w in re.findall(r'(?u)\b\w{2,25}\b', text)] + for w in re.findall(r'(?u)\b\w{%d,%d}\b' + % (self.minlength, self.maxlength), text)] words = set() for word in wordlist: if self.is_stopword(word): continue - if len(word) > 25: continue + if len(word) > self.maxlength: continue words.add(word) # for each word, add an entry in the db @@ -86,7 +87,9 @@ if not wordlist: return [] - l = [word.upper() for word in wordlist if 26 > len(word) > 2] + l = [word.upper() for word in wordlist + if self.minlength <= len(word) <= self.maxlength] + l = [word for word in l if not self.is_stopword(word)] if not l: return []