2011/06/13

Automatic detection of Japanese character encodings with Python

> sudo aptitude install python-dev
> sudo easy_install pykf

def get_file_encode(input_path):
    """Guess the Japanese text encoding of a file using pykf.

    The file is scanned line by line and each line is passed to
    ``pykf.guess``.  The first line that pykf identifies as EUC-JP,
    Shift-JIS, UTF-8 or ISO-2022-JP decides the result; lines guessed as
    anything else (for example plain ASCII) are skipped.

    Requires the ``pykf`` module to be imported by the enclosing module.

    Args:
        input_path: Path of the file to inspect.

    Returns:
        One of 'EUC-JP', 'SHIFT-JIS', 'UTF-8' or 'ISO-2022-JP', or None
        when no line is recognised as one of those encodings.
    """
    # Display names for the encodings we accept as a final answer.
    names = {
        pykf.EUC: 'EUC-JP',
        pykf.SJIS: 'SHIFT-JIS',
        pykf.UTF8: 'UTF-8',
        pykf.JIS: 'ISO-2022-JP',
    }

    # Read raw bytes: the guess has to see the undecoded data, and the
    # ``with`` block closes the file even if pykf raises.
    with open(input_path, 'rb') as input_file:
        for line in input_file:
            guessed = pykf.guess(line)
            if guessed in names:
                return names[guessed]

    return None

No comments:

Post a Comment

100