티스토리 뷰

Programming/python

PYTHON 인코딩 dictionary

람람뿅뿅 2017. 6. 12. 13:50

웹사이트 크롤링하다 인코딩 매번 달라서 짜증나서 만든 encoding_dictionary



참고 사이트: https://docs.python.org/2.4/lib/standard-encodings.html


● 사용법


아래 dictionary 선언을 복사해서 적당한 변수명을 붙여 주세요.


requests 패키지를 사용했을 때, 서버로부터 받은 response에 대해


response.encoding을 입력하면 해당 response의 인코딩 정보가 string으로 나옵니다.


이것을 dictionary 키값으로 보내 결과를 받으면 됩니다.


저 같은 경우, pandas로 테이블을 만들 때나 beautifulsoup 객체로 변환해 줄 때 텍스트 인코딩에 넣어 버립니다.


$ response.text.encode(encoding_map[response.encoding])


● encoding dictionary


# Lookup table mapping encoding aliases (keys) to Python codec names (values).
# NOTE: this is a plain dict, so a lookup must match a key exactly, including
# case -- e.g. 'utf8' and 'UTF' are keys here, but 'utf-8' is not.
# NOTE(review): the entries appear to be the output of the generator script
# shown further down this page (built from the Python 2.4 "standard encodings"
# table) -- confirm before editing entries by hand.
encoding_map = {'437': 'cp437', '646': 'ascii', '850': 'cp850', '852': 'cp852', '855': 'cp855', '857': 'cp857', '860': 'cp860', '861': 'cp861', '862': 'cp862', '863': 'cp863', '865': 'cp865', '866': 'cp866', '869': 'cp869', '8859': 'latin_1', '932': 'cp932', '936': 'gbk', '949': 'cp949', '950': 'cp950', 'CP-GR': 'cp869', 'CP-IS': 'cp861', 'EBCDIC-CP-BE': 'cp500', 'EBCDIC-CP-CH': 'cp500', 'EBCDIC-CP-HE': 'cp424', 'IBM037': 'cp037', 'IBM039': 'cp037', 'IBM424': 'cp424', 'IBM437': 'cp437', 'IBM500': 'cp500', 'IBM775': 'cp775', 'IBM850': 'cp850', 'IBM852': 'cp852', 'IBM855': 'cp855', 'IBM857': 'cp857', 'IBM860': 'cp860', 'IBM861': 'cp861', 'IBM862': 'cp862', 'IBM863': 'cp863', 'IBM864': 'cp864', 'IBM865': 'cp865', 'IBM866': 'cp866', 'IBM869': 'cp869', 'L1': 'latin_1', 'L2': 'iso8859_2', 'L3': 'iso8859_3', 'L4': 'iso8859_4', 'L5': 'iso8859_9', 'L6': 'iso8859_10', 'L8': 'iso8859_14', 'U16': 'utf_16', 'U7': 'utf_7', 'U8': 'utf_8', 'UTF': 'utf_8', 'UTF-16BE': 'utf_16_be', 'UTF-16LE': 'utf_16_le', 'arabic': 'iso8859_6', 'big5-hkscs': 'big5hkscs', 'big5-tw': 'big5', 'chinese': 'gb2312', 'cp1361': 'johab', 'cp154': 'ptcp154', 'cp819': 'latin_1', 'cp936': 'gbk', 'csbig5': 'big5', 'csiso2022jp': 'iso2022_jp', 'csiso2022kr': 'iso2022_kr', 'csiso58gb231280': 'gb2312', 'csptcp154': 'ptcp154', 'csshiftjis': 'shift_jis', 'cyrillic': 'iso8859_5', 'cyrillic-asian': 'ptcp154', 'euc-cn': 'gb2312', 'euccn': 'gb2312', 'eucgb2312-cn': 'gb2312', 'eucjis2004': 'euc_jis_2004', 'eucjisx0213': 'euc_jisx0213', 'eucjp': 'euc_jp', 'euckr': 'euc_kr', 'gb18030-2000': 'gb18030', 'gb2312-1980': 'gb2312', 'gb2312-80': 'gb2312', 'greek': 'iso8859_7', 'greek8': 'iso8859_7', 'hebrew': 'iso8859_8', 'hkscs': 'big5hkscs', 'hz-gb': 'hz', 'hz-gb-2312': 'hz', 'hzgb': 'hz', 'ibm1026': 'cp1026', 'ibm1140': 'cp1140', 'iso-2022-jp': 'iso2022_jp', 'iso-2022-jp-1': 'iso2022_jp_1', 'iso-2022-jp-2': 'iso2022_jp_2', 'iso-2022-jp-2004': 'iso2022_jp_2004', 'iso-2022-jp-3': 'iso2022_jp_3', 'iso-2022-jp-ext': 
'iso2022_jp_ext', 'iso-2022-kr': 'iso2022_kr', 'iso-8859-1': 'latin_1', 'iso-8859-10': 'iso8859_10', 'iso-8859-13': 'iso8859_13', 'iso-8859-14': 'iso8859_14', 'iso-8859-15': 'iso8859_15', 'iso-8859-2': 'iso8859_2', 'iso-8859-3': 'iso8859_3', 'iso-8859-4': 'iso8859_4', 'iso-8859-5': 'iso8859_5', 'iso-8859-6': 'iso8859_6', 'iso-8859-7': 'iso8859_7', 'iso-8859-8': 'iso8859_8', 'iso-8859-9': 'iso8859_9', 'iso-ir-58': 'gb2312', 'iso2022jp': 'iso2022_jp', 'iso2022jp-1': 'iso2022_jp_1', 'iso2022jp-2': 'iso2022_jp_2', 'iso2022jp-2004': 'iso2022_jp_2004', 'iso2022jp-3': 'iso2022_jp_3', 'iso2022jp-ext': 'iso2022_jp_ext', 'iso2022kr': 'iso2022_kr', 'iso8859-1': 'latin_1', 'jisx0213': 'euc_jis_2004', 'korean': 'euc_kr', 'ks_c-5601': 'euc_kr', 'ks_c-5601-1987': 'euc_kr', 'ks_x-1001': 'euc_kr', 'ksc5601': 'euc_kr', 'ksx1001': 'euc_kr', 'latin': 'latin_1', 'latin1': 'latin_1', 'latin2': 'iso8859_2', 'latin3': 'iso8859_3', 'latin4': 'iso8859_4', 'latin5': 'iso8859_9', 'latin6': 'iso8859_10', 'latin8': 'iso8859_14', 'maccentraleurope': 'mac_latin2', 'maccyrillic': 'mac_cyrillic', 'macgreek': 'mac_greek', 'maciceland': 'mac_iceland', 'maclatin2': 'mac_latin2', 'macroman': 'mac_roman', 'macturkish': 'mac_turkish', 'ms-kanji': 'cp932', 'ms1361': 'johab', 'ms932': 'cp932', 'ms936': 'gbk', 'ms949': 'cp949', 'ms950': 'cp950', 'mskanji': 'cp932', 'pt154': 'ptcp154', 's_jis': 'shift_jis', 's_jisx0213': 'shift_jisx0213', 'shiftjis': 'shift_jis', 'shiftjis2004': 'shift_jis_2004', 'shiftjisx0213': 'shift_jisx0213', 'sjis': 'shift_jis', 'sjis2004': 'shift_jis_2004', 'sjis_2004': 'shift_jis_2004', 'sjisx0213': 'shift_jisx0213', 'u-jis': 'euc_jp', 'uhc': 'cp949', 'ujis': 'euc_jp', 'us-ascii': 'ascii', 'utf16': 'utf_16', 'utf8': 'utf_8', 'windows-1250': 'cp1250', 'windows-1251': 'cp1251', 'windows-1252': 'cp1252', 'windows-1253': 'cp1253', 'windows-1254': 'cp1254', 'windows-1255': 'cp1255', 'windows-1257': 'cp1257', 'windows-1258': 'cp1258', 'windows1256': 'cp1256',

}


제작코드

import requests

import pandas as pd

# Build `encoding_map` (alias -> Python codec name) from the "standard
# encodings" table in the Python 2.4 documentation.
from io import StringIO  # local to this script: wraps the HTML for read_html

response = requests.get(
    "https://docs.python.org/2.4/lib/standard-encodings.html",
    timeout=30,  # do not hang forever if the docs server is unresponsive
)

# Fail loudly on an HTTP error instead of trying to parse an error page.
response.raise_for_status()

# read_html returns every <table> on the page; index 1 is the table used here.
# The HTML is wrapped in StringIO because newer pandas releases deprecate
# passing literal HTML strings directly.
encoding_df = pd.read_html(StringIO(response.text))[1]

encoding_map = {}                                                               # result

# Walk the "Codec" and "Aliases" columns in lockstep. (The previous row access
# via DataFrame.ix no longer exists in pandas >= 1.0; the AttributeError it
# raised was swallowed by the old except clause, leaving the map empty.)
for codec_name, aliases in zip(encoding_df["Codec"], encoding_df["Aliases"]):
    try:
        alias_list = aliases.split(',')
    except AttributeError:
        # A non-string cell (NaN when the codec lists no aliases) has no
        # .split(); such rows contribute nothing to the map.
        continue

    for alias in alias_list:
        encoding_map[alias.strip(" ")] = codec_name


'Programming > python' 카테고리의 다른 글

[Python] subprocess를 사용한 병렬 프로그래밍 - (1)  (1) 2017.08.03