# -*- coding: utf-8 -*- # normalizer_sample.py (for Python 2) # # Coded by Takuya Kadowaki # # This software is released under the MIT License. # # http://opensource.org/licenses/mit-license.php # import sys import htmlentitydefs import re import unicodedata import zenhan class Normalizer(object): def __init__(self, char_path): # 全角 JIS X 0208 の文字群 self.char_set = set() for line in open(char_path): self.char_set.add(line[:-1].decode('utf-8')) def __del__(self): pass def main(self, input_path, output_path): num = 0 fp = open(output_path, 'w') for line in open(input_path): line = line.replace('\n', '') # <--必要に応じて改行コードを変更 norm_line = self.check(line.decode('utf-8')) if norm_line == -1: print 'ERROR: Input text is not unicode.' sys.exit() fp.write(norm_line.encode('utf-8') + '\n') # <--必要に応じて改行コードを変更 fp.close() def check(self, text): """textを全角 JIS X 0208で構成されるように変換・除去し,返す """ if type(text) != type(u''): return -1 text2 = self.htmlentity2unicode(self.htmlentity2unicode(text)) text_norm = unicodedata.normalize('NFKC', text2) text_zen = zenhan.h2z(text_norm) zyokyo_list = [] for zen in text_zen: if zen not in self.char_set: zyokyo_list.append(zen) for zyokyo in zyokyo_list: text_zen = text_zen.replace(zyokyo, '') # 除去 return text_zen def htmlentity2unicode(self, text): """実体参照と文字参照を通常の文字に変換し返す <参考元URL> http://www.programming-magic.com/20080820002254/ """ # 正規表現のコンパイル reference_regex = re.compile(u'&(#x?[0-9a-f]+|[a-z]+);', re.IGNORECASE) num16_regex = re.compile(u'#x\d+', re.IGNORECASE) num10_regex = re.compile(u'#\d+', re.IGNORECASE) result = u'' i = 0 while True: # 実体参照 or 文字参照を見つける match = reference_regex.search(text, i) if match is None: result += text[i:] break result += text[i:match.start()] i = match.end() name = match.group(1) # 実体参照 if name in htmlentitydefs.name2codepoint.keys(): result += unichr(htmlentitydefs.name2codepoint[name]) # 文字参照 elif num16_regex.match(name): # 16進数 result += unichr(int(u'0' + name[1:], 16)) elif num10_regex.match(name): # 10進数 result += unichr(int(name[1:])) return result if __name__ == '__main__': argvs = sys.argv argc = len(argvs) if argc != 4: print '以下の書式で実行してください' print 'python normalizer_sample.py [JIS X 0208文字一覧ファイル] [解析したい手順文書のパス] [出力先のパス]' sys.exit() char_path = argvs[1] input_path = argvs[2] output_path = argvs[3] n = Normalizer(char_path) n.main(input_path, output_path) print 'DONE!!'