Quick Start Guide for KoreanCodecs
----------------------------------

$Id: quick_start.txt,v 1.2 2002/04/28 09:10:09 perky Exp $

(this document has EUC-KR characters)


1. Installation

* Normal

$ python setup.py install

* Without aliases (without extra path on python2.0)

$ python setup.py install --without-aliases

* Without C Extensions

$ python setup.py install --without-extension


2. Encoding/Decoding

* EUC-KR Codec (the most widely-used Korean encoding)

>>> unicode("한글이 좋아.", "euc-kr")
u'\ud55c\uae00\uc774 \uc88b\uc544.'
>>> print _.encode("euc-kr")
한글이 좋아.

* CP949 Codec (yet another widely-used encoding among MS Windows users)

>>> unicode("장사는 돈을 버는 것이 아니라 사람을 버는 것이라 하였습니다 ☆ 商道 ☆", "cp949")
u'\uc7a5\uc0ac\ub294 \ub3c8\uc744 ...'
>>> print _[-10:].encode("cp949")
습니다 ☆ 商道 ☆

* Johab, Unijohab and ISO-2022-KR Codecs

(used the same way as described above)

* Qwerty2bul Codec

>>> unicode("원숭이 엉덩이는 빨개", "euc-kr")
u'\uc6d0\uc22d\uc774 \uc5c9\ub369\uc774\ub294 \ube68\uac1c'
>>> _.encode("qwerty2bul")
'dnjstnddl djdejddlsms Qkfro'
>>> unicode("Qkfrks rjtdms tkrhk tkrhksms aktdlTdj", "qwerty2bul")
u'\ube68\uac04 \uac83\uc740 \uc0ac\uacfc \uc0ac\uacfc\ub294 \ub9db\uc788\uc5b4'
>>> print _.encode("euc-kr")
빨간 것은 사과 사과는 맛있어


3. StreamReader, StreamWriter

>>> import codecs
>>> f = codecs.open("quick_start.txt", encoding="euc-kr")
>>> lines = f.readlines()
>>> len(lines)
103
>>> lines[25]
u'2. Encoding/Decoding\n'
>>> lines[96]
u'>>> print hangul.format(fmt, result=u("\ub7ec\uc2a4\ud2f0\ub124\uc77c"), subj1=u("\uc704\uc2a4\ud0a4"), subj2=u("\ub4dc\ub78c\ubdd4")).encode("euc-kr")\n'
>>> f = codecs.open("testing.txt", "w", encoding="qwerty2bul")
>>> f.write(unicode("핑거휠레 맛있다", "euc-kr"))
>>> f.close()
>>> open("testing.txt").read()
'vldrjgnlffp aktdlTek'


4. Hangul Module

>>> from korean import hangul
>>> dir(hangul)
['A', 'AE', 'B', 'BB', 'BS', 'C', 'CHOSUNG_FILLER', ...
]
>>> print hangul.DD.encode("euc-kr"), hangul.GS.encode("euc-kr")
ㄸ ㄳ
>>> print u', '.join(hangul.Chosung).encode('euc-kr')
ㄱ, ㄲ, ㄴ, ㄷ, ㄸ, ㄹ, ㅁ, ㅂ, ㅃ, ㅅ, ㅆ, ㅇ, ㅈ, ㅉ, ㅊ, ㅋ, ㅌ, ㅍ, ㅎ
>>> hangul.ishangul(u'A')
False
>>> hangul.ishangul(unicode("한", "euc-kr"))
True
>>> hangul.isJaeum(unicode("티", "euc-kr"))
False
>>> hangul.isJaeum(unicode("ㅌ", "euc-kr"))
True
>>> u = lambda x: unicode(x, "euc-kr")
>>> print u', '.join(hangul.split(u("칡"))).encode("euc-kr")
ㅊ, ㅣ, ㄺ
>>> print hangul.join([hangul.J, hangul.WA, hangul.L]).encode("euc-kr")
좔
>>> print hangul.join([hangul.K, hangul.WAE, hangul.Null]).encode("euc-kr")
쾌
>>> u("꼬부랑 할머니가 꼬부랑 고개길을")
u'\uaf2c\ubd80\ub791 ...
>>> hangul.disjoint(_)
u'\u1101\u1169\u1107\u116e ...
>>> hangul.conjoin(_)
u'\uaf2c\ubd80\ub791 ...
>>> fmt = u("우리 예쁜 %s(이), %s이 좋아 %s이 좋아?")
>>> print hangul.format(fmt, u("아라"), u("아빠"), u("엄마")).encode("euc-kr")
우리 예쁜 아라, 아빠가 좋아 엄마가 좋아?
>>> print hangul.format(fmt, u("은정"), u("술"), u("물")).encode("euc-kr")
우리 예쁜 은정이, 술이 좋아 물이 좋아?
>>> fmt = u("%(subj1)s와 %(subj2)s을 합치면 %(result)s가 생성된다.")
>>> print hangul.format(fmt, result=u("러스티네일"), subj1=u("위스키"), subj2=u("드람뷔")).encode("euc-kr")
위스키와 드람뷔를 합치면 러스티네일이 생성된다.
>>> print hangul.format(fmt, subj2=u("메칸더투"), subj1=u("메칸더원"), result=u("66% 메칸더브이")).encode("euc-kr")
메칸더원과 메칸더투를 합치면 66% 메칸더브이가 생성된다.


Yes, you got it!