neologd辞書が解析前に適用するテキスト正規化処理

#mecab-ipadic-neologd

https://github.com/neologd/mecab-ipadic-neologd/wiki/Regexp.ja

辞書データを生成する際には以下で述べる正規化処理を全て適用しているため、解析対象のテキストに対して以下の正規化処理を適用すると、辞書中の語とマッチしやすくなる。

str.translate

以下の全角記号は半角記号に置換： ！”＃＄％＆’（）＊＋，－．／：；＜＞？＠［￥］＾＿｀｛｜｝

以下の半角記号は全角記号に置換： ｡､･=｢｣ → 。、・＝「」

re.compile

unicodedata.normalizeする文字列を絞っている（Patternのmatch）

re.split

正規化処理

「ひらがな・全角カタカナ・半角カタカナ・漢字・全角記号」間に含まれる半角スペースは削除

「ひらがな・全角カタカナ・半角カタカナ・漢字・全角記号」と「半角英数字」の間に含まれる半角スペースは削除

正規表現を使って文章中の半角スペースを削除できる

Pattern.search & re.sub & Python reの後方参照

ひらがななどの範囲：基本多言語面（BMP）（『文字コード入門』）

ひらがな、カタカナ、CJK統合漢字

code:normalize_neologd.py

# encoding: utf8

from __future__ import unicode_literals

import re

import unicodedata

def unicode_normalize(cls, s):
    """Apply NFKC normalization only to runs of characters listed in ``cls``.

    Args:
        cls: Contents of a regular-expression character class, written
            without the surrounding square brackets (ranges such as
            ``'０-９'`` are allowed).
        s: Text to normalize.

    Returns:
        ``s`` with every maximal run of ``cls`` characters NFKC-normalized
        and all other text left untouched. Afterwards every FULLWIDTH
        HYPHEN-MINUS in the result is replaced by ASCII ``-``.
    """
    # The square brackets are essential: without them ``cls`` would be
    # compiled as a literal sequence instead of a set of characters.
    pt = re.compile('([{}]+)'.format(cls))

    def norm(c):
        # Only chunks that start with a ``cls`` run are normalized.
        return unicodedata.normalize('NFKC', c) if pt.match(c) else c

    # The capturing group makes split() keep the matched runs in its output,
    # so joining the pieces reconstructs the whole string.
    s = ''.join(norm(x) for x in pt.split(s))
    s = re.sub('－', '-', s)
    return s

def remove_extra_spaces(s):
    """Collapse space runs and delete spaces adjacent to Japanese text.

    Runs of half-width and/or full-width spaces are first collapsed into a
    single half-width space. A remaining single space is then removed when
    it sits between two characters of the Japanese/full-width blocks listed
    below, or between such a character and a Basic Latin character (in
    either order). Spaces between two Basic Latin characters are kept.

    Args:
        s: Text to clean up.

    Returns:
        The text with the redundant spaces removed.
    """
    # Character class (not a literal sequence): half-width OR full-width space.
    s = re.sub('[ 　]+', ' ', s)
    blocks = ''.join((
        '\u4E00-\u9FFF',  # CJK UNIFIED IDEOGRAPHS
        '\u3040-\u309F',  # HIRAGANA
        '\u30A0-\u30FF',  # KATAKANA
        '\u3000-\u303F',  # CJK SYMBOLS AND PUNCTUATION
        '\uFF00-\uFFEF',  # HALFWIDTH AND FULLWIDTH FORMS
    ))
    basic_latin = '\u0000-\u007F'

    def remove_space_between(cls1, cls2, s):
        # Both operands are character-class contents, so they must be
        # wrapped in brackets to match a single character each.
        p = re.compile('([{}]) ([{}])'.format(cls1, cls2))
        # sub() replaces non-overlapping matches only; in "a b c" the middle
        # character is consumed by the first match, so repeat until stable.
        while p.search(s):
            s = p.sub(r'\1\2', s)
        return s

    s = remove_space_between(blocks, blocks, s)
    s = remove_space_between(blocks, basic_latin, s)
    s = remove_space_between(basic_latin, blocks, s)
    return s

def normalize_neologd(s):
    """Normalize text the way mecab-ipadic-NEologd does before analysis.

    Steps, in order:
      1. Strip leading/trailing whitespace.
      2. NFKC-normalize full-width alphanumerics and half-width katakana.
      3. Collapse runs of hyphen-like characters into ``-``.
      4. Collapse runs of long-vowel-mark-like characters into ``ー``.
      5. Delete tilde-like characters.
      6. Map ASCII symbols and half-width punctuation to full-width forms.
      7. Remove redundant spaces (see ``remove_extra_spaces``).
      8. NFKC-normalize full-width symbols back to half-width, except
         ``＝``, ``・``, ``「`` and ``」`` which stay full-width.
      9. Replace the curly quotes ``’`` / ``”`` with ``'`` / ``"``.

    Args:
        s: Text to normalize.

    Returns:
        The normalized text.
    """
    s = s.strip()
    s = unicode_normalize('０-９Ａ-Ｚａ-ｚ｡-ﾟ', s)

    def maketrans(f, t):
        # Build a code-point -> code-point table usable with str.translate;
        # ``f`` and ``t`` must list the characters in the same order.
        return {ord(x): ord(y) for x, y in zip(f, t)}

    # Each pattern below is a character class: any one of the listed
    # characters (or a run of them) matches, not the literal sequence.
    s = re.sub('[˗֊‐‑‒–⁃⁻₋−]+', '-', s)  # normalize hyphens
    s = re.sub('[﹣－ｰ—―─━ー]+', 'ー', s)  # normalize choonpus
    s = re.sub('[~∼∾〜〰～]', '', s)  # remove tildes
    # Symbols are temporarily widened so that remove_extra_spaces() treats
    # them like full-width characters and drops the spaces next to them;
    # most are narrowed again right after. Source and target strings must
    # stay aligned one-to-one (note the '[' and ']' around the yen sign).
    s = s.translate(
        maketrans('!"#$%&\'()*+,-./:;<=>?@[¥]^_`{|}~｡､･｢｣',
                  '！”＃＄％＆’（）＊＋，－．／：；＜＝＞？＠［￥］＾＿｀｛｜｝〜。、・「」'))

    s = remove_extra_spaces(s)
    s = unicode_normalize('！”＃＄％＆’（）＊＋，－．／：；＜＞？＠［￥］＾＿｀｛｜｝〜', s)  # keep ＝,・,「,」
    # NFKC leaves the curly quotes untouched, so convert them explicitly.
    s = re.sub('’', '\'', s)
    s = re.sub('”', '"', s)
    return s

# Self-test: run this file directly to check the normalization rules.
if __name__ == "__main__":
    # Full-width alphanumerics become half-width.
    assert "0123456789" == normalize_neologd("０１２３４５６７８９")
    assert "ABCDEFGHIJKLMNOPQRSTUVWXYZ" == normalize_neologd("ＡＢＣＤＥＦＧＨＩＪＫＬＭＮＯＰＱＲＳＴＵＶＷＸＹＺ")
    assert "abcdefghijklmnopqrstuvwxyz" == normalize_neologd("ａｂｃｄｅｆｇｈｉｊｋｌｍｎｏｐｑｒｓｔｕｖｗｘｙｚ")
    # Full-width symbols become half-width; ［￥］ maps to [¥].
    assert "!\"#$%&'()*+,-./:;<>?@[¥]^_`{|}" == normalize_neologd("！”＃＄％＆’（）＊＋，－．／：；＜＞？＠［￥］＾＿｀｛｜｝")
    # These symbols are kept full-width.
    assert "＝。、・「」" == normalize_neologd("＝。、・「」")
    # Half-width katakana becomes full-width.
    assert "ハンカク" == normalize_neologd("ﾊﾝｶｸ")
    # Hyphens, long-vowel marks and tildes.
    assert "o-o" == normalize_neologd("o₋o")
    assert "majikaー" == normalize_neologd("majika━")
    assert "わい" == normalize_neologd("わ〰い")
    assert "スーパー" == normalize_neologd("スーパーーーー")
    assert "!#" == normalize_neologd("!#")
    # Space handling.
    assert "ゼンカクスペース" == normalize_neologd("ゼンカク　スペース")
    assert "おお" == normalize_neologd("おお")
    assert "おお" == normalize_neologd(" おお")
    assert "おお" == normalize_neologd("おお ")
    assert "検索エンジン自作入門を買いました!!!" == \
        normalize_neologd("検索エンジン自作入門を買いました!!!")
    assert "アルゴリズムC" == normalize_neologd("アルゴリズム C")
    assert "PRML副読本" == normalize_neologd("　　　ＰＲＭＬ　　副　読　本　　　")
    assert "Coding the Matrix" == normalize_neologd("Coding the Matrix")
    assert "南アルプスの天然水Sparking Lemonレモン一絞り" == \
        normalize_neologd("南アルプスの　天然水　Ｓｐａｒｋｉｎｇ　Ｌｅｍｏｎ　レモン一絞り")
    assert "南アルプスの天然水-Sparking*Lemon+レモン一絞り" == \
        normalize_neologd("南アルプスの　天然水-　Ｓｐａｒｋｉｎｇ*　Ｌｅｍｏｎ+　レモン一絞り")