検索式を構文解析するPythonモジュール

細かいチューニングがしたかったのでpyparsingを使って自分で書いた。
一応仕事で使うためのもの(でも間違いなくオーバースペック)。

# -*- coding: utf-8 -*-

#
# Copyright (c) 2007  Nozomu Kaneko
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#

"""Yet another search query parser

This search query parser uses the excellent Pyparsing module 
(http://pyparsing.sourceforge.net/) to parse search queries by users.

You can provide an acceptor to the parser in order to construct parse
results in the specific form.  For example, you can construct
SQLAlchemy `ClauseElement` objects in your acceptor so that search
queries can be translated directly into SQL expressions.


Lots of parts of this script are based on the following scripts:

* aql.py
  http://pyparsing-public.wikispaces.com/space/showimage/aql.py

* searchparser.py
  http://pyparsing.wikispaces.com/space/showimage/searchparser.py


Thanks to the authors of the scripts and the pyparsing module.

"""


from pyparsing import *

def parser():

    _eq_     = Suppress("=")
    _ne_     = Suppress("!=")
    _lt_     = Suppress("<")
    _gt_     = Suppress(">")
    _le_     = Suppress("<=")
    _ge_     = Suppress(">=")

    _lparen_ = Suppress("(")
    _rparen_ = Suppress(")")

    _sign_   = Word("+-", exact=1)
    Int  = Combine(Optional(_sign_) + Word(nums)) \
           .setParseAction(lambda l, s, t: int(t[0])) \
           .setResultsName('int')
    Float = Combine(Optional(_sign_) +
                    ( Word(nums) + "." + Optional(Word(nums))
                    | ("." + Word(nums)) ) + 
                    Optional("E" + Optional(_sign_) + Word(nums))) \
           .setParseAction(lambda l, s, t: float(t[0])) \
           .setResultsName('float')
    Number = Float | Int
    String = quotedString.copy() \
             .setParseAction(removeQuotes) \
             .setResultsName('string')
    Constant = Float | Int | String

    _not_ = Keyword("not", caseless=True).suppress()
    _and_ = Keyword("and", caseless=True).suppress()
    _or_  = Keyword("or", caseless=True).suppress()

    _dotdot_ = Keyword("..").suppress()
    _in_ = Keyword("in", caseless=True).suppress()

    Lval = Word(alphanums + "_.")
    Rval = Float | Int | String | Lval

    RvalList = ( _lparen_
               + Group(delimitedList(Rval)).setResultsName('vals')
               + _rparen_
               )

    Clause = Forward()
    ComplexClause = Forward()

    Range = ( _lparen_
            + Optional(Number, default=None).setResultsName('min')
            + _dotdot_
            + Optional(Number, default=None).setResultsName('max')
            + _rparen_
            )

    Clause << ( Group( Lval + _eq_ + Rval ).setResultsName('eq')
              | Group( Lval + _ne_ + Rval ).setResultsName('ne')
              | Group( Lval + _lt_ + Rval ).setResultsName('lt')
              | Group( Lval + _gt_ + Rval ).setResultsName('gt')
              | Group( Lval + _le_ + Rval ).setResultsName('le')
              | Group( Lval + _ge_ + Rval ).setResultsName('ge')
              | Group( Lval + _eq_ + Range ).setResultsName('range')
              | Group( Lval + _in_ + RvalList ).setResultsName('in')
              | Group( _lparen_ + ComplexClause + _rparen_).setResultsName('paren')
              )

    ComplexClause << \
        ( Group( _not_ + Clause ).setResultsName('not')
        | Group( Clause + _or_ + ComplexClause ).setResultsName('or')
        | Group( Clause + _and_ + ComplexClause ).setResultsName('and')
        | Clause
        )

    BareWord = CharsNotIn(ParserElement.DEFAULT_WHITE_CHARS + '"()')

    Term = Group( String | BareWord ).setResultsName('term')

    Prefix = ( Regex(r"([A-Za-z][A-Za-z0-9_\-\.]*):") \
                   .setParseAction(lambda l, s, t: t[0][:-1])
             | _sign_
             )
    Directive = Group( Prefix + ( String | BareWord )).setResultsName('directive')

    QueryElement = Forward()
    QueryElementList = Forward()

    QueryElement << ( Directive
                    | ComplexClause
                    | ( _lparen_ + QueryElementList + _rparen_ )
                    | Term
                    )

    QueryElementList << \
        ( Group( QueryElement + _or_ + QueryElementList).setResultsName('or')
        | Group( QueryElement + Optional(_and_) + QueryElementList).setResultsName('and')
        | QueryElement
        )

    Query = QueryElementList + stringEnd

    return Query


class SearchQueryParser(object):

    """
    >>> p = SearchQueryParser(NullAcceptor())

    >>> p("a")
    'a'
    >>> p(u"\u30c6\u30b9\u30c8 test *")
    ('and', u'\u30c6\u30b9\u30c8', u'test', u'*')
    >>> p('x y z')
    ('and', 'x', 'y', 'z')
    >>> p('x and y or z')
    ('and', 'x', ('or', 'y', 'z'))
    >>> p('(a or b)')
    ('or', 'a', 'b')
    >>> p('(a or b) and (c or d)')
    ('and', ('or', 'a', 'b'), ('or', 'c', 'd'))
    >>> p('x +y -z')
    ('and', 'x', ('directive', '+', 'y'), ('directive', '-', 'z'))
    >>> p('"x y z"')
    'x y z'
    >>> p(u'\u3042 \u3044 \u3046')
    ('and', u'\u3042', u'\u3044', u'\u3046')
    >>> p(u'\u3042 +\u3044 -\u3046')
    ('and', u'\u3042', ('directive', u'+', u'\u3044'), ('directive', u'-', u'\u3046'))
    >>> p('x = 1.0')
    ('=', 'x', 1.0)
    >>> p('x < 1')
    ('<', 'x', 1)
    >>> p('x = "1"')
    ('=', 'x', '1')
    >>> p('x = ""')
    ('=', 'x', '')

    >>> p('x = 1 and y != 2')
    ('and', ('=', 'x', 1), ('!=', 'y', 2))
    >>> p('z = "foo" and not x = 3')
    ('and', ('=', 'z', 'foo'), ('not', ('=', 'x', 3)))
    >>> p('z >= 3 or z <= 5')
    ('or', ('>=', 'z', 3), ('<=', 'z', 5))
    >>> p('z <= 3 or z >= 4')
    ('or', ('<=', 'z', 3), ('>=', 'z', 4))
    >>> p('x = ( .. )')
    ('range', 'x', None, None)
    >>> p('x = ( 1 .. 3.0 )')
    ('range', 'x', 1, 3.0)
    >>> p('x = ( 1.0 .. )')
    ('range', 'x', 1.0, None)
    >>> p('x in ( 1, 2, 3 ) or z >= 4')
    ('or', ('in', 'x', [1, 2, 3]), ('>=', 'z', 4))
    >>> p('z <= 3 and z = ( 1 .. 2 )')
    ('and', ('<=', 'z', 3), ('range', 'z', 1, 2))
    >>> p('z = "foo" or (z != 3.0 and fiz != "fozz")')
    ('or', ('=', 'z', 'foo'), ('and', ('!=', 'z', 3.0), ('!=', 'fiz', 'fozz')))

    >>> p("foo:val test x=y")
    ('and', ('directive', 'foo', 'val'), 'test', ('=', 'x', 'y'))
    >>> p('test foo:"a b c" x=y')
    ('and', 'test', ('directive', 'foo', 'a b c'), ('=', 'x', 'y'))


    """

    def __init__(self, acceptor):
        self.parser = parser()
        self.acceptor = acceptor

    def __call__(self, query):
        r = self.parser.parseString(query)
        return self.visit(r[0])

    def visit(self, arg):
        methods = {
            'and':   self.acceptor.accept_and,
            'or':    self.acceptor.accept_or,
            'not':   self.acceptor.accept_not,
            'eq':    self.acceptor.accept_eq,
            'ne':    self.acceptor.accept_ne,
            'lt':    self.acceptor.accept_lt,
            'gt':    self.acceptor.accept_gt,
            'le':    self.acceptor.accept_le,
            'ge':    self.acceptor.accept_ge,
            'range': self.acceptor.accept_range,
            'in':    self.acceptor.accept_in,
            'vals':  lambda *kw: list(kw),
            'paren': self.visit,
        }

        if not isinstance(arg, ParseResults):
            return arg

        name = arg.getName()
        if name == 'directive':
            return self.acceptor.accept_directive(*arg)
        elif name == 'term':
            return self.acceptor.accept_term(*arg)
        else:
            f = methods[name]
            return f(*map(self.visit, arg))



class NullAcceptor(object):
    """
    Example acceptor for SearchQueryParser.
    It is also used in doctest of SearchQueryParser.

    You don't have to override this class, but you should define your
    own acceptor with all the `accept_*` methods.
    """

    def accept_and(self, lhs, rhs):
        if isinstance(rhs, tuple) and rhs[0] == 'and':
            return ('and', lhs) + rhs[1:]
        else:
            return ('and', lhs, rhs)

    def accept_or(self, lhs, rhs):
        if isinstance(rhs, tuple) and rhs[0] == 'or':
            return ('or', lhs) + rhs[1:]
        else:
            return ('or', lhs, rhs)

    def accept_not(self, clause):
        return ('not', clause)

    def accept_eq(self, lhs, rhs):
        return ('=', lhs, rhs)

    def accept_ne(self, lhs, rhs):
        return ('!=', lhs, rhs)

    def accept_lt(self, lhs, rhs):
        return ('<', lhs, rhs)

    def accept_gt(self, lhs, rhs):
        return ('>', lhs, rhs)

    def accept_le(self, lhs, rhs):
        return ('<=', lhs, rhs)

    def accept_ge(self, lhs, rhs):
        return ('>=', lhs, rhs)

    def accept_range(self, lval, min, max):
        return ('range', lval, min, max)

    def accept_in(self, lval, rval):
        return ('in', lval, rval)

    def accept_directive(self, lval, rval):
        return ('directive', lval, rval)

    def accept_term(self, term):
        return term


def repl():
    import cmd
    class Repl(cmd.Cmd):
        def __init__(self, **kw):
            cmd.Cmd.__init__(self, **kw)
            self.parser = SearchQueryParser(NullAcceptor())
        def onecmd(self, line):
            if line == '':
                return True
            print self.parser(line)
            return False
    Repl().cmdloop()


if __name__ == '__main__':
    import sys
    if len(sys.argv) == 1:
        repl()
    elif len(sys.argv) > 1 and sys.argv[1] == 'test':
        import doctest
        doctest.testmod()
    else:
        warn("Invalid arguments")


__all__ = [ 'SearchQueryParser', 'NullAcceptor' ]

ソースコード中のコメントにも書いてあるように、検索式から直接SQL文を生成することが最終目標。次は構文解析の結果からSQLAlchemyのClauseElementを生成する部分を書く。