Yahoo! 形態素解析 API for Python
Pythonではまだなさそうだったので、作った。
# -*- coding: utf-8 -*-
"""Client for the Yahoo! JAPAN morphological analysis web API (MAService V1).

The module exposes part-of-speech codes as integer constants,
``MASchema`` (builds a formencode schema for one ``<word>`` record) and
``WebMA`` (sends a sentence to the service and yields the parsed words).

Run as a script, it reads sentences from standard input, one per line, and
prints ``surface<TAB>feature`` for every word followed by ``EOS``.

The code targets Python 2 (``unicode``, ``urllib.urlopen``).
"""
from urllib import urlopen, urlencode

from lxml import etree
from formencode import Schema
import formencode.validators as validators

# Part-of-speech codes.
# NOTE(review): presumably these are the values accepted by the service's
# ``filter`` parameter -- confirm against the API documentation.
ADJECTIVE = 1         # adjective
ADJECTIVAL_NOUN = 2   # adjectival noun (na-adjective)
INTERJECTION = 3      # interjection
ADVERB = 4            # adverb
ADNOMIAL = 5          # adnominal (spelling kept as-is: it is a public name)
CONJUNCTION = 6       # conjunction
PREFIX = 7            # prefix
SUFFIX = 8            # suffix
NOUN = 9              # noun
VERB = 10             # verb
PARTICLE = 11         # particle
AUX_VERB = 12         # auxiliary verb
OTHER = 13            # special (punctuation, brackets, symbols, etc.)

POS_MIN = 1
# NOTE(review): the largest code defined above is 13; the value 14 is kept
# unchanged -- confirm whether it is meant as an exclusive upper bound.
POS_MAX = 14


def MASchema(response="", uniq=False):
    """Build a formencode ``Schema`` for one word record.

    :param response: comma-separated field names to keep.  An empty value
        (``""`` or ``None``) selects ``surface``, ``reading`` and ``pos``.
    :param uniq: when true, an integer ``count`` field is added as well.
    :returns: a ``Schema`` that drops unknown keys and fills missing fields
        with ``""`` (``0`` for ``count``).
    """
    schema = Schema()
    schema.filter_extra_fields = True
    schema.allow_extra_fields = True
    schema.ignore_key_missing = True

    if response:
        wanted = response.split(",")
    else:
        wanted = ["surface", "reading", "pos"]

    known_fields = (
        ("surface", validators.UnicodeString, ""),
        ("reading", validators.UnicodeString, ""),
        ("pos", validators.UnicodeString, ""),
        ("baseform", validators.UnicodeString, ""),
        ("feature", validators.UnicodeString, ""),
    )
    for name, validator, default in known_fields:
        if name in wanted:
            schema.add_field(name, validator(if_missing=default))
    if uniq:
        schema.add_field("count", validators.Int(if_missing=0))
    return schema


def _strip_namespace(tag, ns):
    """Return *tag* without a leading ``{ns}`` qualifier, if it has one."""
    prefix = "{%s}" % ns
    if tag.startswith(prefix):
        return tag[len(prefix):]
    return tag


class WebMA(object):
    """Thin client for the MAService ``parse`` endpoint.

    :param app_id: application id sent as ``appid``; falls back to
        ``default_app_id`` when empty.
    :param uniq: request ``results=uniq`` instead of ``results=ma``.
    :param uniq_by_baseform: sent as ``uniq_by_baseform`` when *uniq* is set.
    :param filter: default value of the ``filter`` request parameter.
    :param response: comma-separated field list sent as ``response``;
        defaults to ``default_uniq_response`` or ``default_response``
        depending on *uniq*.
    :param validator: object with a ``to_python(dict)`` method applied to
        every word; when omitted a ``MASchema`` is built per call.
    """

    base_url = 'http://api.jlp.yahoo.co.jp/MAService/V1/parse'
    namespace = 'urn:yahoo:jp:jlp'
    default_app_id = 'Yahoo! MAService API for Python'
    default_response = "surface,reading,pos,baseform,feature"
    default_uniq_response = "surface,reading,pos,baseform,feature,count"

    def __init__(self, app_id=None, uniq=False, uniq_by_baseform=False,
                 filter=None, response=None, validator=None):
        self.app_id = app_id or self.default_app_id
        self.uniq = uniq
        self.uniq_by_baseform = uniq_by_baseform
        self.filter = filter or ""
        self.response = response
        self.validator = validator
        if self.response is None:
            if self.uniq:
                self.response = self.default_uniq_response
            else:
                self.response = self.default_response

    def make_params(self, sentence, filter=None):
        """Return the url-encoded request parameters for *sentence*.

        :param sentence: text to analyse (already encoded by the caller).
        :param filter: overrides the instance-level ``filter`` when not
            ``None``.
        """
        if filter is None:
            filter = self.filter
        params = dict(
            appid=self.app_id,
            results="uniq" if self.uniq else "ma",
            filter=filter,
            response=self.response,
            sentence=sentence,
        )
        if self.uniq:
            flag = self.uniq_by_baseform
            # urlencode() would serialise a bool as "True"/"False"; send the
            # lowercase form instead.  Non-bool values pass through untouched.
            if isinstance(flag, bool):
                flag = "true" if flag else "false"
            params["uniq_by_baseform"] = flag
        return urlencode(params)

    def parse(self, sentence, response=None, filter=None,
              use_post=True, urlopen=urlopen):
        """Analyse *sentence* and yield one validated dict per word.

        This is a generator: the HTTP request is issued when iteration
        starts, not when ``parse`` is called.

        :param sentence: ``unicode`` text (encoded to UTF-8 here) or a byte
            string that the caller has already encoded as UTF-8.
        :param response: field list used to build the default schema;
            defaults to ``self.response``.
        :param filter: per-call override of the ``filter`` parameter.
        :param use_post: send the parameters as POST data when true,
            otherwise append them to the URL as a query string.
        :param urlopen: callable ``urlopen(url, data)`` returning something
            ``lxml.etree.parse`` can read; injectable for testing.
        :raises IOError: when the document root is not ``ResultSet``.
        """
        if isinstance(sentence, unicode):
            sentence = sentence.encode('utf-8')
        # else: caller must provide a utf-8 encoded string for 'sentence'

        params = self.make_params(sentence, filter)
        if use_post:
            url = self.base_url
            data = params
        else:
            url = self.base_url + '?' + params
            data = None

        stream = urlopen(url, data)
        try:
            et = etree.parse(stream)
        finally:
            # A custom urlopen may return something without close().
            close = getattr(stream, "close", None)
            if close is not None:
                close()

        # Passed by keyword: newer lxml releases reject a positional map.
        nsmap = {'ns': self.namespace}

        root_tag = _strip_namespace(et.getroot().tag, self.namespace)
        if root_tag != "ResultSet":
            msgs = []
            if root_tag == "Error":
                msgs = et.xpath("./ns:Message", namespaces=nsmap)
            text = msgs[0].text if msgs else None
            raise IOError(text or "something wrong")

        if response is None:
            response = self.response
        validator = self.validator or MASchema(response, self.uniq)

        for word in et.xpath(".//ns:word_list/ns:word", namespaces=nsmap):
            record = {}
            for child in word:
                record[_strip_namespace(child.tag, self.namespace)] = child.text
            yield validator.to_python(record)


# Public API: the two callables plus every upper-case integer constant.
__all__ = ['MASchema', 'WebMA']
# Iterate over a snapshot: binding the loop variables adds module globals.
for _name, _value in list(globals().items()):
    if _name.isupper() and isinstance(_value, int):
        __all__.append(_name)
del _name, _value


if __name__ == '__main__':
    import sys
    import locale

    encoding = locale.getpreferredencoding() or sys.getdefaultencoding()
    ma = WebMA()
    for line in sys.stdin:
        for w in ma.parse(line.rstrip().decode(encoding)):
            print("%(surface)s\t%(feature)s" % w)
        # One end-of-sentence marker per input line.
        print("EOS")
テスト:
% echo "すもももももももものうち" | python webma.py
すもも 名詞,名詞,*,すもも,すもも,すもも
も 助詞,係助詞,*,も,も,も
もも 動詞,マ五,未然ウ接続,もも,もも,もむ
もも 動詞,マ五,未然ウ接続,もも,もも,もむ
も 助詞,係助詞,*,も,も,も
の 助詞,助詞連体化,*,の,の,の
うち 名詞,名詞,*,うち,うち,うち
EOS
ちなみに同じ入力に対するMeCabの出力は、
% echo "すもももももももものうち" | mecab
すもも 名詞,一般,*,*,*,*,すもも,スモモ,スモモ
も 助詞,係助詞,*,*,*,*,も,モ,モ
もも 名詞,一般,*,*,*,*,もも,モモ,モモ
も 助詞,係助詞,*,*,*,*,も,モ,モ
もも 名詞,一般,*,*,*,*,もも,モモ,モモ
の 助詞,連体化,*,*,*,*,の,ノ,ノ
うち 名詞,非自立,副詞可能,*,*,*,うち,ウチ,ウチ
EOS