#!python
# vim: encoding=latin1 ts=8 sts=4 et si sw=4 ft=python
"""
Module ranges.py: evaluate search expressions

This module intends to support simplified filtering expressions
for numerical data.  Since 'and' is used differently in everyday
language than in mathematics, no 'and' functionality is implemented.
Thus, values and sub-expressions can be reordered for simplification
and optimization purposes.
"""

__author__ = 'Tobias Herp <tobias.herp@gmx.net>'

__usage__ = """ as a Python module:

from ranges import make_expression
e = make_expression(expression_string)
filtered_numbers = filter(e.filter_func(), numbers)

Or try it in a shell:

ranges.py "{expression}" [...]

In this case, the filter functions are applied to the range
of the integer numbers 1 to 10.
Example expressions: "3 to 5", "[3; 6)", "(1; 5) or [7; 9]";
no 'and' operator is implemented.\
"""

VERSION = '.'.join(map(str,
                       (0,
                        1, # initial version
                        )))

OPENING_PARANTHESES = '([{'
CLOSING_PARANTHESES = '}])'
PARENTHESES = tuple(OPENING_PARANTHESES + CLOSING_PARANTHESES)
KEYWORDS = tuple('or to'.split())
PARENTH, NUMBER, LETTER, WHITESPACE, SEMICOLON, COMMA = range(6)
LONER_TYPES = (PARENTH, SEMICOLON, COMMA,)
NON_NUMBERS = tuple(';,') + KEYWORDS
ENUM, RANGE = range(1, 3)
NORMAL_TOKEN = [        # for normalization
        None,
        {';':   'or',   # ENUM
         ',':   'or',
         'or':  'or',
         'to':  'to',
         },
        {';':   'to',   # RANGE
         ',':   'to',
         'or':  'or',
         'to':  'to',
         },
        ]

CHARSET = dict() # --> prepare_charsets()
PARMATCH = dict()
PARMATCH['['] = tuple('])')
PARMATCH['('] = PARMATCH['[']
PARMATCH['{'] = ('}',)
PARMATCH[None] = (None,)
OPERATOR = dict()
OPERATOR['['] = '<='
OPERATOR['('] = '<'
OPERATOR[']'] = '<='
OPERATOR[')'] = '<'

class ExpressionError(Exception):
    """\
    Some error occurred evaluating the expression
    """
class ParenthesesError(ExpressionError):
    "Some error about parentheses"
class ParenthesesMismatch(ParenthesesError):
    pass
class ParenthesesLeftover(ParenthesesError):
    "more closing than opening parentheses"
class ParenthesesOpen(ParenthesesError):
    "more opening than closing parentheses"
class ProgrammingError(ExpressionError):
    pass
class NotSupportedError(ExpressionError):
    pass
class CharacterError(NotSupportedError, ValueError):
    pass
class AmbiguityError(NotSupportedError):
    "expression not supported for ambiguity"

'''
def pretokenize(s):
    """
    only cares for parentheses
    """
    buf = list()
    for ch in s:
        if ch in PARENTHESES:
            if buf:
                yield ''.join(buf)
                del buf[:]
                # buf = list()
            yield ch
        else:
            buf.append(ch)
    if buf:
        yield ''.join(buf)
'''

def prepare_charsets():
    from string import digits, whitespace, ascii_letters
    global CHARSET
    for ch in PARENTHESES:
        CHARSET[ch] = PARENTH
    for ch in digits:
        CHARSET[ch] = NUMBER
    for ch in '+-.':
        CHARSET[ch] = NUMBER
    for ch in ascii_letters:
        CHARSET[ch] = LETTER
    for ch in whitespace:
        CHARSET[ch] = WHITESPACE
    CHARSET[','] = COMMA
    CHARSET[';'] = SEMICOLON

prepare_charsets()

def tokenize(s):
    """
    takes care of all types of tokens
    """
    prevtype = None
    buf = list()
    for ch in s+' ':
        try:
            thistype = CHARSET[ch]
        except KeyError:
            raise CharacterError('Unsupported character: %r' % ch)
        if thistype in LONER_TYPES:
            if buf:
                if prevtype is WHITESPACE:
                    yield ' '
                else:
                    yield ''.join(buf)
                del buf[:]
            yield ch
            prevtype = thistype
        elif thistype == prevtype:
            buf.append(ch)
        else:
            if buf:
                if prevtype is WHITESPACE:
                    yield ' '
                else:
                    yield ''.join(buf)
                del buf[:]
            buf.append(ch)
            prevtype = thistype

class Expression:
    """
    - supports nesting
    - values a used directly rather than treated as sub-expressions

    An Expression
    - has a normalized form
    - can be represented by a boolean function which takes one value
    - can be expressed as an SQL while clause
    """

    def __init__(self, parenth):
        """
        parenth -- the opening parenthesis
                   (may be None, for the root expression)
        """
        self.consider_parenth(parenth)
        self._subexpressions = list()
        self._tokens = list()
        self._dirty = False
        self._subs_dirty = False 

    def consider_parenth(self, parenth):
        self._opening_parenth = parenth
        self._closing_parenth = None
        if not (parenth is None or parenth in OPENING_PARANTHESES):
            raise ProgrammingError('%r is not an opening parenthesis'
                                   % parenth)
        self._type = None
        if parenth in ('(', '['):
            self._default_type = RANGE
        elif parenth in ('{', None):
            self._default_type = ENUM
            if parenth == '{':
                self._type = ENUM
        else:
            raise ProgrammingError('%r not considered' % parenth)

    def _simple_value(self):
        """
        if the expression has a simple value, return it;
        otherwise return None
        """
        self._cleanup_tokens()
        self._cleanup_subexpressions()
        if self._subexpressions:
            return
        tokens = self._tokens
        if len(tokens) > 1:
            return None
        elif not tokens:
            return None
        elif token[0] in NON_NUMBERS:
            return None
        else:
            return token[0]

    def _cleanup_subexpressions(self):
        """
        some subexpressions might be empty or replacable
        by a single value
        """
        if not self._subs_dirty:
            return
        subs = list(self._subexpressions)
        tokens = self._tokens
        filtered_subs = list()
        changed = 0
        for sub in subs:
            if not sub.normalized():
                changed = 1
                continue
            sv = sub._simple_value()
            if sv is not None:
                if tokens:
                    self.feed('or')
                self.feed(sv)
                changed = 1
                del sub
            else:
                filtered_subs.append(sub)
        if changed:
            self._subexpressions = filtered_subs
        self._subs_dirty = 0 


    def _cleanup_tokens(self):
        """
        normalize tokens, e.g. eliminating whitespace.
        """
        if self._dirty:
            prev_a_number = None
            changed = 0
            tokens = self._tokens
            filtered = list()
            # eliminate whitespace; ',' -> ';'; insert 'or' as needed:
            for this in tokens:
                if this == ' ':
                    raise ProgrammingError('Oh, bugger! (2)')
                    changed = 1
                    continue
                non_number = this in NON_NUMBERS
                a_number = not non_number
                if a_number and prev_a_number:
                    filtered.append('or')
                    changed = 1
                if this == ',':
                    filtered.append(';')
                    changed = 1
                else:
                    filtered.append(this)
                prev_a_number = a_number
            if changed:
                self._tokens = tokens = filtered
                changed = 0
            self._dirty = 0

    def _set_type(self):
        """
        sets the type
        """
        if self._type is None:
            if self._subexpressions:
                self._type = ENUM
            elif 'to' in self._tokens:
                self._type = RANGE
            elif 'or' in self._tokens:
                self._type = ENUM
            else:
                self._type = self._default_type

    def _freeze(self):
        self._cleanup_subexpressions()
        self._cleanup_tokens()
        self._set_type()

    def feed(self, token):
        """
        token -- a token (a string)
        """
        tokens = self._tokens # identity!
        if token == ' ': # and not tokens:
            return
        self._dirty = 1
        if token in CLOSING_PARANTHESES:
            self._closing_parenth = token
            return
        if token in NON_NUMBERS:
            tokens.append(token)
        elif isinstance(token, (str, unicode)):
            try:
                tokens.append(int(token))
            except ValueError:
                try:
                    tokens.append(float(token))
                except ValueError:
                    raise NotSupportedError('Unsupported token: %r' % token)
        else:
            tokens.append(token)
        if tokens is not self._tokens:
            self._tokens = tokens

    def add_sub(self, expr):
        """
        expr -- an Expression object

        Subexpressions are hold in a separate list; this way
        simple values can be evaluated first
        """
        assert isinstance(expr, Expression)
        self._subexpressions.append(expr)

    '''
    def normalized(self):
        """
        return a human-readable, normalized form of the Expression
        (and all sub-expressions).

        This function is supposed to help in cases of invalid
        expressions; for this reason it doesn't simply jerk for
        invalid input but tries to do its best to give a hint
        about the error.

        This function won't find every error; to be sure your
        expression works, use the python_expression method (which
        is in turn called by filter_func)
        """
        self._freeze()
        tokens = self._tokens
        subs = self._subexpressions
        if not tokens and not subs:
            return ''
        res = [] # excludes the parentheses; joined with ' '
        if self._type is RANGE:
            return '<range dummy>'
            if len(tokens) == 3 and token[1] == ';':
                token[1] = 'to'
            if len(tokens) == 3 and token[1] == 'to':
                res.extend(tokens)
            else:
                idx = 0
                tmpl = list()
                tmptok = list(tokens)
                num_found = 0
                nn_found = 0
                prev_num = None
                while idx < 3:
                    tok = tmptok.pop(0)
                    this_num = not tok in NON_NUMBERS
                    if this_num:
                        num_found += 1
                        if prev_num:
                            tmpl.append(' <err> ')
                        tmpl.append(tok)
                        it prev_num:
                            break
                    else:
                        nn_found += 1
                        if not idx: #1st token: open range
                            idx += 1
                        elif prev_num:
                            if tok in KEYWORDS:
                                tmpl.append(' ')
                        else:
                            tmpl.append(' <err> ')
                        tmpl.append(tok+' ')
                    prev_num = this_num
                    idx += 1
                res.extend(''.join(map(str, tmpl)))
                if tmpl: # should be consumed!
                    res.extend('<err>')
                elif self._subexpressions:
                    res.extend('<err>')
        else:               # ENUM type
            for tok in tokens:
                if tok == ';':
                    res.append('or')
                else:
                    res.append(str(tok))
        # tokens are consumed
        for se in self._subexpressions:
            nse = se.normalized()
            if not nse:
                raise ProgrammingError('Bugger! (3)')
            if res:
                res.append('or')
            res.append(nse)
        return '%s%s%s' % (
                self._opening_parenth or '',
                ' '.join(res),
                self._closing_parenth or '',
                )
    '''

    def _python_expression(self):
        self._freeze()
        if not self._closing_parenth in PARMATCH[self._opening_parenth]:
            raise ParenthesesMismatch('parentheses don\'t match (%r, %r)'
                                      % (self._opening_parenth,
                                         self._closing_parenth))
        if self._type is ENUM:
            return self._enum_expression()
        elif self._type is RANGE:
            return self._range_expression()
        else:
            raise ProgrammingError(
                    'type %r not supported' % self._type)

    def _enum_expression(self):
        "to be called by _python_expression only"
        res = list()
        expr = list()
        for tok in self._tokens:
            if tok in NON_NUMBERS:
                if tok == 'to':
                    raise AmbiguityError('token %r is not allowed'
                                         ' in enumerations' % tok)
            else:
                res.append(str(tok))
        if res:
            expr.append('x in (%s,)' % ','.join(res))
            del res[:]
        expr.extend(['(%s)' % se._python_expression()
                     for se in self._subexpressions])
        return ' or '.join(expr) or '1'

    def _range_expression(self):
        res = list()
        tokens = list(self._tokens)
        idx = 0
        prev_number = None
        while idx < 3:
            try:
                tok = tokens.pop(0)
            except IndexError:
                break
            this_number = not tok in NON_NUMBERS
            if not idx:             # 1st token
                if this_number:
                    res.append('%s %s' % (
                        tok, OPERATOR[self._opening_parenth or '[']))
                    res.append('x')
                else:
                    idx += 1
            elif this_number:       # not the 1st token!
                if prev_number:
                    raise ExpressionError('adjacent numbers not allowed'
                                          ' in range expressions (%s)'
                                          % tok)
                if not 'x' in res:
                    res.append('x')
                res.append('%s %s' % (
                    OPERATOR[self._closing_parenth or ']'], tok))
            elif prev_number is 0:
                raise ExpressionError('adjacent operators not allowed (%s)'
                                      % tok)
            idx += 1
            prev_number = this_number
        if tokens:
            raise ExpressionError('surplus tokens %r' % tokens)
        return ' '.join(res) or '1'

    def python_expression(self):
        return self._python_expression()

    def filter_func(self):
        f = eval('lambda x:'+self._python_expression())
        return f
        s = '\n\t'.join(('def f(x):',
                         'return '+ self._python_expression()))
        eval(s)
        return f

def make_expression(s):
    """
    take a string and return an Expression object

    s -- a string, intended for filtering numerical values
    """
    root = Expression(None)
    stack = [root]
    for token in tokenize(s+' '):
        if token in OPENING_PARANTHESES:
            e = Expression(token)
            stack[-1].add_sub(e)
            stack.append(e)
        else:
            stack[-1].feed(token)
            if token in CLOSING_PARANTHESES:
                try:
                    stack.pop()
                except IndexError:
                    raise ParenthesesLeftover()
    if len(stack) > 1:
        raise ParenthesesOpen('parentheses open at end of expression')
    elif not stack:
        raise ProgrammingError('Should have raised a ParenthesesMismatch')
    return root

def evaluate(s):
    """
    take a string, tokenize it, and return

    - a string, containing a normalized version of the expression
    - a function to filter() a sequence of values
    """
    root = make_expression(s)
    return root.normalized(), root.filter_func()

if __name__ == '__main__':
    # I have my reasons ;-) 
    kwargs = dict(version='%prog '+VERSION,
                  usage=__usage__)
    import optparse
    parser = optparse.OptionParser(**kwargs)
    (option, args) = parser.parse_args()

    POOL = range(1, 11)
    for e in args:
        eo = make_expression(e)
        print '%s  -->  %s' % (e, filter(eo.filter_func(), POOL))
        print '    (Python expression: %s)' % eo.python_expression()