#!python # vim: encoding=latin1 ts=8 sts=4 et si sw=4 ft=python """ Module ranges.py: evaluate search expressions This module intends to support simplified filtering expressions for numerical data. Since 'and' is used differently in everyday language than in mathematics, no 'and' functionality is implemented. Thus, values and sub-expressions can be reordered for simplification and optimization purposes. """ __author__ = 'Tobias Herp ' __usage__ = """ as a Python module: from ranges import make_expression e = make_expression(expression_string) filtered_numbers = filter(e.filter_func(), numbers) Or try it in a shell: ranges.py "{expression}" [...] In this case, the filter functions are applied to the range of the integer numbers 1 to 10. Example expressions: "3 to 5", "[3; 6)", "(1; 5) or [7; 9]"; no 'and' operator is implemented.\ """ VERSION = '.'.join(map(str, (0, 1, # initial version ))) OPENING_PARANTHESES = '([{' CLOSING_PARANTHESES = '}])' PARENTHESES = tuple(OPENING_PARANTHESES + CLOSING_PARANTHESES) KEYWORDS = tuple('or to'.split()) PARENTH, NUMBER, LETTER, WHITESPACE, SEMICOLON, COMMA = range(6) LONER_TYPES = (PARENTH, SEMICOLON, COMMA,) NON_NUMBERS = tuple(';,') + KEYWORDS ENUM, RANGE = range(1, 3) NORMAL_TOKEN = [ # for normalization None, {';': 'or', # ENUM ',': 'or', 'or': 'or', 'to': 'to', }, {';': 'to', # RANGE ',': 'to', 'or': 'or', 'to': 'to', }, ] CHARSET = dict() # --> prepare_charsets() PARMATCH = dict() PARMATCH['['] = tuple('])') PARMATCH['('] = PARMATCH['['] PARMATCH['{'] = ('}',) PARMATCH[None] = (None,) OPERATOR = dict() OPERATOR['['] = '<=' OPERATOR['('] = '<' OPERATOR[']'] = '<=' OPERATOR[')'] = '<' class ExpressionError(Exception): """\ Some error occurred evaluating the expression """ class ParenthesesError(ExpressionError): "Some error about parentheses" class ParenthesesMismatch(ParenthesesError): pass class ParenthesesLeftover(ParenthesesError): "more closing than opening parentheses" class ParenthesesOpen(ParenthesesError): "more opening than closing parentheses" class ProgrammingError(ExpressionError): pass class NotSupportedError(ExpressionError): pass class CharacterError(NotSupportedError, ValueError): pass class AmbiguityError(NotSupportedError): "expression not supported for ambiguity" ''' def pretokenize(s): """ only cares for parentheses """ buf = list() for ch in s: if ch in PARENTHESES: if buf: yield ''.join(buf) del buf[:] # buf = list() yield ch else: buf.append(ch) if buf: yield ''.join(buf) ''' def prepare_charsets(): from string import digits, whitespace, ascii_letters global CHARSET for ch in PARENTHESES: CHARSET[ch] = PARENTH for ch in digits: CHARSET[ch] = NUMBER for ch in '+-.': CHARSET[ch] = NUMBER for ch in ascii_letters: CHARSET[ch] = LETTER for ch in whitespace: CHARSET[ch] = WHITESPACE CHARSET[','] = COMMA CHARSET[';'] = SEMICOLON prepare_charsets() def tokenize(s): """ takes care of all types of tokens """ prevtype = None buf = list() for ch in s+' ': try: thistype = CHARSET[ch] except KeyError: raise CharacterError('Unsupported character: %r' % ch) if thistype in LONER_TYPES: if buf: if prevtype is WHITESPACE: yield ' ' else: yield ''.join(buf) del buf[:] yield ch prevtype = thistype elif thistype == prevtype: buf.append(ch) else: if buf: if prevtype is WHITESPACE: yield ' ' else: yield ''.join(buf) del buf[:] buf.append(ch) prevtype = thistype class Expression: """ - supports nesting - values a used directly rather than treated as sub-expressions An Expression - has a normalized form - can be represented by a boolean function which takes one value - can be expressed as an SQL while clause """ def __init__(self, parenth): """ parenth -- the opening parenthesis (may be None, for the root expression) """ self.consider_parenth(parenth) self._subexpressions = list() self._tokens = list() self._dirty = False self._subs_dirty = False def consider_parenth(self, parenth): self._opening_parenth = parenth self._closing_parenth = None if not (parenth is None or parenth in OPENING_PARANTHESES): raise ProgrammingError('%r is not an opening parenthesis' % parenth) self._type = None if parenth in ('(', '['): self._default_type = RANGE elif parenth in ('{', None): self._default_type = ENUM if parenth == '{': self._type = ENUM else: raise ProgrammingError('%r not considered' % parenth) def _simple_value(self): """ if the expression has a simple value, return it; otherwise return None """ self._cleanup_tokens() self._cleanup_subexpressions() if self._subexpressions: return tokens = self._tokens if len(tokens) > 1: return None elif not tokens: return None elif token[0] in NON_NUMBERS: return None else: return token[0] def _cleanup_subexpressions(self): """ some subexpressions might be empty or replacable by a single value """ if not self._subs_dirty: return subs = list(self._subexpressions) tokens = self._tokens filtered_subs = list() changed = 0 for sub in subs: if not sub.normalized(): changed = 1 continue sv = sub._simple_value() if sv is not None: if tokens: self.feed('or') self.feed(sv) changed = 1 del sub else: filtered_subs.append(sub) if changed: self._subexpressions = filtered_subs self._subs_dirty = 0 def _cleanup_tokens(self): """ normalize tokens, e.g. eliminating whitespace. """ if self._dirty: prev_a_number = None changed = 0 tokens = self._tokens filtered = list() # eliminate whitespace; ',' -> ';'; insert 'or' as needed: for this in tokens: if this == ' ': raise ProgrammingError('Oh, bugger! (2)') changed = 1 continue non_number = this in NON_NUMBERS a_number = not non_number if a_number and prev_a_number: filtered.append('or') changed = 1 if this == ',': filtered.append(';') changed = 1 else: filtered.append(this) prev_a_number = a_number if changed: self._tokens = tokens = filtered changed = 0 self._dirty = 0 def _set_type(self): """ sets the type """ if self._type is None: if self._subexpressions: self._type = ENUM elif 'to' in self._tokens: self._type = RANGE elif 'or' in self._tokens: self._type = ENUM else: self._type = self._default_type def _freeze(self): self._cleanup_subexpressions() self._cleanup_tokens() self._set_type() def feed(self, token): """ token -- a token (a string) """ tokens = self._tokens # identity! if token == ' ': # and not tokens: return self._dirty = 1 if token in CLOSING_PARANTHESES: self._closing_parenth = token return if token in NON_NUMBERS: tokens.append(token) elif isinstance(token, (str, unicode)): try: tokens.append(int(token)) except ValueError: try: tokens.append(float(token)) except ValueError: raise NotSupportedError('Unsupported token: %r' % token) else: tokens.append(token) if tokens is not self._tokens: self._tokens = tokens def add_sub(self, expr): """ expr -- an Expression object Subexpressions are hold in a separate list; this way simple values can be evaluated first """ assert isinstance(expr, Expression) self._subexpressions.append(expr) ''' def normalized(self): """ return a human-readable, normalized form of the Expression (and all sub-expressions). This function is supposed to help in cases of invalid expressions; for this reason it doesn't simply jerk for invalid input but tries to do its best to give a hint about the error. This function won't find every error; to be sure your expression works, use the python_expression method (which is in turn called by filter_func) """ self._freeze() tokens = self._tokens subs = self._subexpressions if not tokens and not subs: return '' res = [] # excludes the parentheses; joined with ' ' if self._type is RANGE: return '' if len(tokens) == 3 and token[1] == ';': token[1] = 'to' if len(tokens) == 3 and token[1] == 'to': res.extend(tokens) else: idx = 0 tmpl = list() tmptok = list(tokens) num_found = 0 nn_found = 0 prev_num = None while idx < 3: tok = tmptok.pop(0) this_num = not tok in NON_NUMBERS if this_num: num_found += 1 if prev_num: tmpl.append(' ') tmpl.append(tok) it prev_num: break else: nn_found += 1 if not idx: #1st token: open range idx += 1 elif prev_num: if tok in KEYWORDS: tmpl.append(' ') else: tmpl.append(' ') tmpl.append(tok+' ') prev_num = this_num idx += 1 res.extend(''.join(map(str, tmpl))) if tmpl: # should be consumed! res.extend('') elif self._subexpressions: res.extend('') else: # ENUM type for tok in tokens: if tok == ';': res.append('or') else: res.append(str(tok)) # tokens are consumed for se in self._subexpressions: nse = se.normalized() if not nse: raise ProgrammingError('Bugger! (3)') if res: res.append('or') res.append(nse) return '%s%s%s' % ( self._opening_parenth or '', ' '.join(res), self._closing_parenth or '', ) ''' def _python_expression(self): self._freeze() if not self._closing_parenth in PARMATCH[self._opening_parenth]: raise ParenthesesMismatch('parentheses don\'t match (%r, %r)' % (self._opening_parenth, self._closing_parenth)) if self._type is ENUM: return self._enum_expression() elif self._type is RANGE: return self._range_expression() else: raise ProgrammingError( 'type %r not supported' % self._type) def _enum_expression(self): "to be called by _python_expression only" res = list() expr = list() for tok in self._tokens: if tok in NON_NUMBERS: if tok == 'to': raise AmbiguityError('token %r is not allowed' ' in enumerations' % tok) else: res.append(str(tok)) if res: expr.append('x in (%s,)' % ','.join(res)) del res[:] expr.extend(['(%s)' % se._python_expression() for se in self._subexpressions]) return ' or '.join(expr) or '1' def _range_expression(self): res = list() tokens = list(self._tokens) idx = 0 prev_number = None while idx < 3: try: tok = tokens.pop(0) except IndexError: break this_number = not tok in NON_NUMBERS if not idx: # 1st token if this_number: res.append('%s %s' % ( tok, OPERATOR[self._opening_parenth or '['])) res.append('x') else: idx += 1 elif this_number: # not the 1st token! if prev_number: raise ExpressionError('adjacent numbers not allowed' ' in range expressions (%s)' % tok) if not 'x' in res: res.append('x') res.append('%s %s' % ( OPERATOR[self._closing_parenth or ']'], tok)) elif prev_number is 0: raise ExpressionError('adjacent operators not allowed (%s)' % tok) idx += 1 prev_number = this_number if tokens: raise ExpressionError('surplus tokens %r' % tokens) return ' '.join(res) or '1' def python_expression(self): return self._python_expression() def filter_func(self): f = eval('lambda x:'+self._python_expression()) return f s = '\n\t'.join(('def f(x):', 'return '+ self._python_expression())) eval(s) return f def make_expression(s): """ take a string and return an Expression object s -- a string, intended for filtering numerical values """ root = Expression(None) stack = [root] for token in tokenize(s+' '): if token in OPENING_PARANTHESES: e = Expression(token) stack[-1].add_sub(e) stack.append(e) else: stack[-1].feed(token) if token in CLOSING_PARANTHESES: try: stack.pop() except IndexError: raise ParenthesesLeftover() if len(stack) > 1: raise ParenthesesOpen('parentheses open at end of expression') elif not stack: raise ProgrammingError('Should have raised a ParenthesesMismatch') return root def evaluate(s): """ take a string, tokenize it, and return - a string, containing a normalized version of the expression - a function to filter() a sequence of values """ root = make_expression(s) return root.normalized(), root.filter_func() if __name__ == '__main__': # I have my reasons ;-) kwargs = dict(version='%prog '+VERSION, usage=__usage__) import optparse parser = optparse.OptionParser(**kwargs) (option, args) = parser.parse_args() POOL = range(1, 11) for e in args: eo = make_expression(e) print '%s --> %s' % (e, filter(eo.filter_func(), POOL)) print ' (Python expression: %s)' % eo.python_expression()