Yahoo! 形態素解析 API for Python

Python向けのクライアント実装がまだ見当たらなかったので、作ってみた。

# -*- coding: utf-8 -*-

from urllib import urlopen, urlencode
from lxml import etree
from formencode import Schema
import formencode.validators as validators


# Part-of-speech codes, numbered consecutively starting at 1.
(
    ADJECTIVE,        # adjective
    ADJECTIVAL_NOUN,  # adjectival noun
    INTERJECTION,     # interjection
    ADVERB,           # adverb
    ADNOMIAL,         # adnominal (pre-noun adjectival)
    CONJUNCTION,      # conjunction
    PREFIX,           # prefix
    SUFFIX,           # suffix
    NOUN,             # noun
    VERB,             # verb
    PARTICLE,         # particle
    AUX_VERB,         # auxiliary verb
    OTHER,            # special (punctuation, brackets, symbols, etc.)
) = range(1, 14)

# Bounds of the part-of-speech code range.
# NOTE(review): POS_MAX is one greater than OTHER; presumably it is meant as
# an exclusive upper bound -- confirm against callers before relying on it.
POS_MIN, POS_MAX = 1, 14


def MASchema(response="", uniq=False):
    """Build a formencode ``Schema`` for one word record of an API result.

    ``response`` is a comma-separated string of field names; an empty string
    selects ``surface``, ``reading`` and ``pos``.  Of the names requested,
    only ``surface``, ``reading``, ``pos``, ``baseform`` and ``feature`` get
    a field, each a ``UnicodeString`` that falls back to ``""`` when missing.
    When ``uniq`` is true an ``Int`` field ``count`` (falling back to ``0``)
    is added as well.

    The returned schema has ``filter_extra_fields``, ``allow_extra_fields``
    and ``ignore_key_missing`` all set to ``True``.
    """
    if response != "":
        wanted = response.split(",")
    else:
        wanted = ["surface", "reading", "pos"]

    schema = Schema()
    schema.filter_extra_fields = True
    schema.allow_extra_fields = True
    schema.ignore_key_missing = True

    # Fields are always added in this fixed order, whatever order the
    # caller listed them in.
    for field in ("surface", "reading", "pos", "baseform", "feature"):
        if field in wanted:
            schema.add_field(field, validators.UnicodeString(if_missing=""))

    if uniq:
        schema.add_field("count", validators.Int(if_missing=0))

    return schema


def _strip_namespace(tag, ns):
    if tag.startswith("{%s}" % ns):
        return tag[len(ns)+2:]
    else:
        return tag


class WebMA(object):
    """Client for the web morphological-analysis service at ``base_url``.

    An instance holds the request settings (application id, result mode,
    part-of-speech filter, response fields) and an optional validator that
    is applied to every word record returned by :meth:`parse`.
    """

    base_url  = 'http://api.jlp.yahoo.co.jp/MAService/V1/parse'
    namespace = 'urn:yahoo:jp:jlp'

    default_app_id        = 'Yahoo! MAService API for Python'
    default_response      = "surface,reading,pos,baseform,feature"
    default_uniq_response = "surface,reading,pos,baseform,feature,count"

    def __init__(self, app_id=None, uniq=False, uniq_by_baseform=False,
                 filter=None, response=None, validator=None):
        """Store the request settings.

        app_id           -- application id; falls back to ``default_app_id``.
        uniq             -- when true, request ``results=uniq`` instead of
                            ``results=ma``.
        uniq_by_baseform -- sent as ``uniq_by_baseform`` when ``uniq`` is true.
        filter           -- default ``filter`` request parameter ("" if unset).
        response         -- comma-separated response field names; when
                            ``None`` a default matching ``uniq`` is chosen.
        validator        -- object whose ``to_python(dict)`` is applied to each
                            word; when unset, ``parse`` builds a ``MASchema``.
        """
        self.app_id           = app_id or self.default_app_id
        self.uniq             = uniq
        self.uniq_by_baseform = uniq_by_baseform
        self.filter           = filter or ""
        self.response         = response
        self.validator        = validator

        if self.response is None:
            if self.uniq:
                self.response = self.default_uniq_response
            else:
                self.response = self.default_response

    def make_params(self, sentence, filter=None, response=None):
        """Return the url-encoded request parameters for ``sentence``.

        ``filter`` and ``response`` override the instance settings for this
        one request when they are not ``None``.
        """
        if filter is None:
            filter = self.filter
        if response is None:
            response = self.response

        if self.uniq:
            results = "uniq"
        else:
            results = "ma"

        d = dict(appid=self.app_id,
                 results=results,
                 filter=filter,
                 response=response,
                 sentence=sentence)
        if self.uniq:
            flag = self.uniq_by_baseform
            if isinstance(flag, bool):
                # urlencode() would send Python's "True"/"False"; send the
                # lowercase literals instead.  Non-bool values (e.g. a string
                # supplied by the caller) are passed through untouched.
                flag = str(flag).lower()
            d["uniq_by_baseform"] = flag
        return urlencode(d)

    def parse(self, sentence, response=None, filter=None, use_post=True,
              urlopen=urlopen):
        """Analyse ``sentence`` and yield one validated record per word.

        This is a generator: the HTTP request is only issued once iteration
        starts.

        sentence -- text to analyse; ``unicode`` is encoded to UTF-8, any
                    other value must already be a UTF-8 encoded string.
        response -- per-call override of the response fields; used both for
                    the request and for the default validator.
        filter   -- per-call override of the ``filter`` request parameter.
        use_post -- send the parameters as POST data (default) or, when
                    false, append them to the URL as a query string.
        urlopen  -- callable ``urlopen(url, data)`` whose result is handed to
                    ``etree.parse``; replaceable by the caller.

        Raises ``IOError`` when the document root is not ``ResultSet``.
        """
        if isinstance(sentence, unicode):
            sentence = sentence.encode('utf-8')
        # Otherwise the caller must provide a UTF-8 encoded string.

        if response is None:
            response = self.response

        params = self.make_params(sentence, filter, response)
        if use_post:
            url = self.base_url
            data = params
        else:
            url = self.base_url + '?' + params
            data = None

        source = urlopen(url, data)
        try:
            et = etree.parse(source)
        finally:
            # The document is fully parsed at this point, so release the
            # connection.  An injected urlopen may return something without
            # close() (e.g. a file name), hence the guard.
            close = getattr(source, "close", None)
            if close is not None:
                close()

        ns_map = {'ns': self.namespace}
        root_tag = _strip_namespace(et.getroot().tag, self.namespace)
        if root_tag != "ResultSet":
            message = None
            if root_tag == "Error":
                msgs = et.xpath("./ns:Message", namespaces=ns_map)
                if len(msgs) > 0:
                    message = msgs[0].text
            raise IOError(message or "something wrong")

        validator = self.validator or MASchema(response, self.uniq)

        for word in et.xpath(".//ns:word_list/ns:word", namespaces=ns_map):
            d = {}
            for e in word:
                d[_strip_namespace(e.tag, self.namespace)] = e.text
            yield validator.to_python(d)


# Public API: the two callables plus every upper-case integer constant
# defined at module level (the part-of-speech codes and their bounds).
# The globals are snapshotted with list() and scanned inside a generator
# expression so that no loop variable is bound at module level while the
# namespace is being iterated.
__all__ = ['MASchema', 'WebMA']
__all__.extend(sorted(
    name for name, value in list(globals().items())
    if name.isupper() and isinstance(value, int)))


if __name__ == '__main__':
    # Command-line demo: analyse each line of standard input and write one
    # "surface<TAB>feature" row per word, followed by an "EOS" line.
    import locale
    import sys

    encoding = locale.getpreferredencoding() or \
               sys.getdefaultencoding()
    ma = WebMA()
    for line in sys.stdin:
        for w in ma.parse(line.rstrip().decode(encoding)):
            row = "%(surface)s\t%(feature)s\n" % w
            # Encode explicitly with the same encoding used to decode the
            # input: writing unicode text directly fails when stdout is a
            # pipe or file and therefore has no encoding of its own.
            sys.stdout.write(row.encode(encoding))
        sys.stdout.write("EOS\n")

テスト:

% echo "すもももももももものうち" | python webma.py 
すもも  名詞,名詞,*,すもも,すもも,すもも
も      助詞,係助詞,*,も,も,も
もも    動詞,マ五,未然ウ接続,もも,もも,もむ
もも    動詞,マ五,未然ウ接続,もも,もも,もむ
も      助詞,係助詞,*,も,も,も
の      助詞,助詞連体化,*,の,の,の
うち    名詞,名詞,*,うち,うち,うち
EOS

ちなみに、同じ入力に対する MeCab の出力は次のとおり。

% echo "すもももももももものうち" | mecab
すもも  名詞,一般,*,*,*,*,すもも,スモモ,スモモ
も      助詞,係助詞,*,*,*,*,も,モ,モ
もも    名詞,一般,*,*,*,*,もも,モモ,モモ
も      助詞,係助詞,*,*,*,*,も,モ,モ
もも    名詞,一般,*,*,*,*,もも,モモ,モモ
の      助詞,連体化,*,*,*,*,の,ノ,ノ
うち    名詞,非自立,副詞可能,*,*,*,うち,ウチ,ウチ
EOS