// UnicodeData.m - Charmap application Unicode data
// Copyright (C) 2003,2004 Christopher Culver
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 2
// of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

// NOTE(review): the include target was missing from the reviewed text;
// "UnicodeData.h" is assumed from the guard macro name -- confirm.
#ifndef _UNICODE_DATA_H_
#include "UnicodeData.h"
#endif

/*
 * First code point of each Unicode block, in ascending order.  The final
 * entry is one past the last code point, so the extent of block i is
 * kBlockOffsetTable[i + 1] - kBlockOffsetTable[i].  -init copies this table
 * into the blockOffsets instance variable.
 */
static const long kBlockOffsetTable[] =
{
  0x0000,   // Basic Latin
  0x0080,   // Latin-1 Supplement
  0x0100,   // Latin Extended-A
  0x0180,   // Latin Extended-B
  0x0250,   // IPA Extensions
  0x02B0,   // Spacing Modifier Letters
  0x0300,   // Combining Diacritical Marks
  0x0370,   // Greek and Coptic
  0x0400,   // Cyrillic
  0x0500,   // Cyrillic Supplementary
  0x0530,   // Armenian
  0x0590,   // Hebrew
  0x0600,   // Arabic
  0x0700,   // Syriac
  0x0780,   // Thaana
  0x0900,   // Devanagari
  0x0980,   // Bengali
  0x0A00,   // Gurmukhi
  0x0A80,   // Gujarati
  0x0B00,   // Oriya
  0x0B80,   // Tamil
  0x0C00,   // Telugu
  0x0C80,   // Kannada
  0x0D00,   // Malayalam
  0x0D80,   // Sinhala
  0x0E00,   // Thai
  0x0E80,   // Lao
  0x0F00,   // Tibetan
  0x1000,   // Myanmar
  0x10A0,   // Georgian
  0x1100,   // Hangul Jamo
  0x1200,   // Ethiopic
  0x13A0,   // Cherokee
  0x1400,   // Unified Canadian Aboriginal Syllabics
  0x1680,   // Ogham
  0x16A0,   // Runic
  0x1700,   // Tagalog
  0x1720,   // Hanunoo
  0x1740,   // Buhid
  0x1760,   // Tagbanwa
  0x1780,   // Khmer
  0x1800,   // Mongolian
  0x1900,   // Limbu
  0x1950,   // Tai Le
  0x19E0,   // Khmer Symbols
  0x1D00,   // Phonetic Extensions
  0x1E00,   // Latin Extended Additional
  0x1F00,   // Greek Extended
  0x2000,   // General Punctuation
  0x2070,   // Superscripts and Subscripts
  0x20A0,   // Currency Symbols
  0x20D0,   // Combining Diacritical Marks for Symbols
  0x2100,   // Letterlike Symbols
  0x2150,   // Number Forms
  0x2190,   // Arrows
  0x2200,   // Mathematical Operators
  0x2300,   // Miscellaneous Technical
  0x2400,   // Control Pictures
  0x2440,   // Optical Character Recognition
  0x2460,   // Enclosed Alphanumerics
  0x2500,   // Box Drawing
  0x2580,   // Block Elements
  0x25A0,   // Geometric Shapes
  0x2600,   // Miscellaneous Symbols
  0x2700,   // Dingbats
  0x27C0,   // Miscellaneous Mathematical Symbols-A
  0x27F0,   // Supplemental Arrows-A
  0x2800,   // Braille Patterns
  0x2900,   // Supplemental Arrows-B
  0x2980,   // Miscellaneous Mathematical Symbols-B
  0x2A00,   // Supplemental Mathematical Operators
  0x2B00,   // Miscellaneous Symbols and Arrows
  0x2E80,   // CJK Radicals Supplement
  0x2F00,   // Kangxi Radicals
  0x2FF0,   // Ideographic Description Characters
  0x3000,   // CJK Symbols and Punctuation
  0x3040,   // Hiragana
  0x30A0,   // Katakana
  0x3100,   // Bopomofo
  0x3130,   // Hangul Compatibility Jamo
  0x3190,   // Kanbun
  0x31A0,   // Bopomofo Extended
  0x31F0,   // Katakana Phonetic Extensions
  0x3200,   // Enclosed CJK Letters and Months
  0x3300,   // CJK Compatibility
  0x3400,   // CJK Unified Ideographs Extension A
  0x4DC0,   // Yijing Hexagram Symbols
  0x4E00,   // CJK Unified Ideographs
  0xA000,   // Yi Syllables
  0xA490,   // Yi Radicals
  0xAC00,   // Hangul Syllables
  0xD800,   // High Surrogates
  0xDB80,   // High Private Use Surrogates
  0xDC00,   // Low Surrogates
  0xE000,   // Private Use Area
  0xF900,   // CJK Compatibility Ideographs
  0xFB00,   // Alphabetic Presentation Forms
  0xFB50,   // Arabic Presentation Forms-A
  0xFE00,   // Variation Selectors
  0xFE20,   // Combining Half Marks
  0xFE30,   // CJK Compatibility Forms
  0xFE50,   // Small Form Variants
  0xFE70,   // Arabic Presentation Forms-B
  0xFF00,   // Halfwidth and Fullwidth Forms
  0xFFF0,   // Specials
  0x10000,  // Linear B Syllabary
  0x10080,  // Linear B Ideograms
  0x10100,  // Aegean Numbers
  0x10300,  // Old Italic
  0x10330,  // Gothic
  0x10380,  // Ugaritic
  0x10400,  // Deseret
  0x10450,  // Shavian
  0x10480,  // Osmanya
  0x10800,  // Cypriot Syllabary
  0x1D000,  // Byzantine Musical Symbols
  0x1D100,  // Musical Symbols
  0x1D300,  // Tai Xuan Jing Symbols
  0x1D400,  // Mathematical Alphanumeric Symbols
  0x20000,  // CJK Unified Ideographs Extension B
  0x2F800,  // CJK Compatibility Ideographs Supplement
  0xE0000,  // Tags
  0xE0100,  // Variation Selectors Supplement
  0xF0000,  // Supplementary Private Use Area-A
  0x100000, // Supplementary Private Use Area-B
  (0x10FFFF + 1) // END
};

static const unsigned int kBlockOffsetCount =
  sizeof(kBlockOffsetTable) / sizeof(kBlockOffsetTable[0]);

@implementation UnicodeData : NSObject
{
}

/* Closes the data files opened by -init and releases the owned
   collections. */
- (void) dealloc
{
  if (unicodeDataFile != NULL)
    fclose(unicodeDataFile);
  if (unihanFile != NULL)
    fclose(unihanFile);

  /* The pointer must still be valid when RELEASE runs; clearing it first
     would turn the release into a no-op and leak the array. */
  RELEASE(blockNames);
  RELEASE(categoriesDict);
  [super dealloc];
}

/*
 * Builds the block name list, the block offset table and the general
 * category dictionary, then opens the data files:
 *   - UnicodeData.txt, looked up as a resource of the main bundle;
 *   - Unihan.txt, opened relative to the current working directory
 *     (optional; CJK lookups return nil without it).
 * The receiver is still returned when UnicodeData.txt cannot be opened; in
 * that case -dictionaryForCharacter: returns nil for every character.
 */
- (id) init
{
  NSString *path;
  unsigned int i;

  self = [super init];
  if (self == nil)
    return nil;

  blockNames = [[NSArray alloc] initWithObjects:
    @"Basic Latin",
    @"Latin-1 Supplement",
    @"Latin Extended-A",
    @"Latin Extended-B",
    @"IPA Extensions",
    @"Spacing Modifier Letters",
    @"Combining Diacritical Marks",
    @"Greek and Coptic",
    @"Cyrillic",
    @"Cyrillic Supplementary",
    @"Armenian",
    @"Hebrew",
    @"Arabic",
    @"Syriac",
    @"Thaana",
    @"Devanagari",
    @"Bengali",
    @"Gurmukhi",
    @"Gujarati",
    @"Oriya",
    @"Tamil",
    @"Telugu",
    @"Kannada",
    @"Malayalam",
    @"Sinhala",
    @"Thai",
    @"Lao",
    @"Tibetan",
    @"Myanmar",
    @"Georgian",
    @"Hangul Jamo",
    @"Ethiopic",
    @"Cherokee",
    @"Unified Canadian Aboriginal Syllabics",
    @"Ogham",
    @"Runic",
    @"Tagalog",
    @"Hanunoo",
    @"Buhid",
    @"Tagbanwa",
    @"Khmer",
    @"Mongolian",
    @"Limbu",
    @"Tai Le",
    @"Khmer Symbols",
    @"Phonetic Extensions",
    @"Latin Extended Additional",
    @"Greek Extended",
    @"General Punctuation",
    @"Superscripts and Subscripts",
    @"Currency Symbols",
    @"Combining Diacritical Marks for Symbols",
    @"Letterlike Symbols",
    @"Number Forms",
    @"Arrows",
    @"Mathematical Operators",
    @"Miscellaneous Technical",
    @"Control Pictures",
    @"Optical Character Recognition",
    @"Enclosed Alphanumerics",
    @"Box Drawing",
    @"Block Elements",
    @"Geometric Shapes",
    @"Miscellaneous Symbols",
    @"Dingbats",
    @"Miscellaneous Mathematical Symbols-A",
    @"Supplemental Arrows-A",
    @"Braille Patterns",
    @"Supplemental Arrows-B",
    @"Miscellaneous Mathematical Symbols-B",
    @"Supplemental Mathematical Operators",
    @"Miscellaneous Symbols and Arrows",
    @"CJK Radicals Supplement",
    @"Kangxi Radicals",
    @"Ideographic Description Characters",
    @"CJK Symbols and Punctuation",
    @"Hiragana",
    @"Katakana",
    @"Bopomofo",
    @"Hangul Compatibility Jamo",
    @"Kanbun",
    @"Bopomofo Extended",
    @"Katakana Phonetic Extensions",
    @"Enclosed CJK Letters and Months",
    @"CJK Compatibility",
    @"CJK Unified Ideographs Extension A",
    @"Yijing Hexagram Symbols",
    @"CJK Unified Ideographs",
    @"Yi Syllables",
    @"Yi Radicals",
    @"Hangul Syllables",
    @"High Surrogates",
    @"High Private Use Surrogates",
    @"Low Surrogates",
    @"Private Use Area",
    @"CJK Compatibility Ideographs",
    @"Alphabetic Presentation Forms",
    @"Arabic Presentation Forms-A",
    @"Variation Selectors",
    @"Combining Half Marks",
    @"CJK Compatibility Forms",
    @"Small Form Variants",
    @"Arabic Presentation Forms-B",
    @"Halfwidth and Fullwidth Forms",
    @"Specials",
    /* Commented out until support of higher portions of Unicode is
       implemented. */
    // @"Linear B Syllabary",
    // @"Linear B Ideograms",
    // @"Aegean Numbers",
    // @"Old Italic",
    // @"Gothic",
    // @"Ugaritic",
    // @"Deseret",
    // @"Shavian",
    // @"Osmanya",
    // @"Cypriot Syllabary",
    // @"Byzantine Musical Symbols",
    // @"Musical Symbols",
    // @"Tai Xuan Jing Symbols",
    // @"Mathematical Alphanumeric Symbols",
    // @"CJK Unified Ideographs Extension B",
    // @"CJK Compatibility Ideographs Supplement",
    // @"Tags",
    // @"Variation Selectors Supplement",
    // @"Supplementary Private Use Area-A",
    // @"Supplementary Private Use Area-B",
    nil];

  for (i = 0; i < kBlockOffsetCount; i++)
    blockOffsets[i] = kBlockOffsetTable[i];

  /* General category abbreviation (key) -> descriptive name (value). */
  categoriesDict = [[NSDictionary alloc] initWithObjectsAndKeys:
    @"Other, Control", @"Cc",
    @"Other, Format", @"Cf",
    @"Letter, Lowercase", @"Ll",
    @"Letter, Modifier", @"Lm",
    @"Letter, Other", @"Lo",
    @"Letter, Uppercase", @"Lu",
    @"Mark, Spacing Combining", @"Mc",
    @"Mark, Non-Spacing", @"Mn",
    @"Number, Decimal Digit", @"Nd",
    @"Number, Other", @"No",
    @"Punctuation, Connector", @"Pc",
    @"Punctuation, Dash", @"Pd",
    @"Punctuation, Closed", @"Pe",
    @"Punctuation, Other", @"Po",
    @"Punctuation, Open", @"Ps",
    @"Symbol, Currency", @"Sc",
    @"Symbol, Modifier", @"Sk",
    @"Symbol, Math", @"Sm",
    @"Symbol, Other", @"So",
    @"Separator, Space", @"Zs",
    nil];

  path = [[NSBundle mainBundle] pathForResource: @"UnicodeData"
                                         ofType: @"txt"];
  unicodeDataFile = NULL;
  unicodeDataLength = 0;
  if (path != nil)
    unicodeDataFile = fopen([path fileSystemRepresentation], "rb");
  if (unicodeDataFile != NULL)
    {
      fseek(unicodeDataFile, 0, SEEK_END);
      unicodeDataLength = ftell(unicodeDataFile);
    }
  else
    {
      NSLog(@"UnicodeData: could not open the UnicodeData.txt resource");
    }

  unihanFile = fopen("Unihan.txt", "rb");
  if (unihanFile != NULL)
    {
      fseek(unihanFile, 0, SEEK_END);
      unihanLength = ftell(unihanFile);
    }

  return self;
}

/* Returns the array of block names, indexed like the block offset table. */
- (NSArray *) getBlockNames
{
  return blockNames;
}

/* Returns the number of code points in block unicodeBlock.  The index is
   not range-checked; it must refer to an entry of the block offset table
   other than the final END marker. */
- (int) getBlockSize: (int) unicodeBlock
{
  return (blockOffsets[unicodeBlock + 1] - blockOffsets[unicodeBlock]);
}

/* Returns the first code point of block unicodeBlock (index not
   range-checked). */
- (long) getBlockStart: (int) unicodeBlock
{
  return blockOffsets[unicodeBlock];
}

/* Returns the last code point of block unicodeBlock, i.e. one less than the
   start of the following block (index not range-checked). */
- (long) getBlockEnd: (int) unicodeBlock
{
  return (blockOffsets[unicodeBlock + 1] - 1);
}

/*
 * Locates the line of f that contains byte offset `offset` by scanning
 * backwards, at most 16 bytes at a time, for the preceding newline.
 *
 * On return *real_offset holds the offset of the first byte of the line and
 * *length the number of bytes in the line (newline excluded).  The return
 * value points to a NUL-terminated copy of the first 15 characters of the
 * line; the unused tail of the buffer is zero-filled so that callers which
 * skip a short prefix (buf + 2) never read stale bytes.
 *
 * The result lives in a static buffer: it is overwritten by the next call
 * and the function is not reentrant.
 */
static const char *get_start_of_line_near(FILE *f, unsigned int offset,
  unsigned int *real_offset, unsigned int *length)
{
  static char buf[16];
  unsigned int ofs;
  int i;
  int ch;

  ofs = offset;
  while (ofs > 0)
    {
      unsigned int chunk_start = (ofs > 16) ? ofs - 16 : 0;
      unsigned int chunk_len = ofs - chunk_start;
      size_t got;

      fseek(f, chunk_start, SEEK_SET);
      got = fread(buf, 1, chunk_len, f);

      /* Only inspect the bytes that were actually read. */
      for (i = (int)got - 1; i >= 0; i--)
        if (buf[i] == '\n')
          break;

      if (i >= 0)
        {
          /* The line starts right after the newline we found. */
          ofs = chunk_start + i + 1;
          break;
        }

      /* No newline in this chunk; keep looking further back.  Reaching
         chunk_start == 0 means the line starts at the top of the file. */
      ofs = chunk_start;
    }

  /* The line starts at ofs. */
  *real_offset = ofs;
  *length = 0;

  i = 0;
  fseek(f, ofs, SEEK_SET);
  while ((ch = fgetc(f)) != '\n' && ch != EOF)
    {
      if (i < 15)
        buf[i++] = ch;
      (*length)++;
    }

  /* Terminate and clear the remainder of the buffer. */
  for (; i < 16; i++)
    buf[i] = 0;

  return buf;
}

/*
 * Looks up character `number` in Unihan.txt and returns a dictionary that
 * maps the second tab-separated field of each of its lines to the third
 * one.  Returns nil when Unihan.txt is not available or no line for the
 * character is found.
 *
 * The file is binary-searched on the hexadecimal number that follows the
 * two-character prefix of each line, so lines must be sorted by code point.
 * The topmost probes of the search tree are memoised in the han_cache_*
 * arrays (node k has children 2k and 2k+1).
 */
- (NSDictionary *) unihanDictionaryForCharacter: (unsigned int)number
{
  /* Offset of the first non-comment line; computed once and shared by all
     instances. */
  static int start_of_data = -1;
  FILE *f = unihanFile;
  int first_line = -1;

  if (f == NULL)
    return nil;

  if (start_of_data == -1)
    {
      /* Find where the initial comments end and the real data starts. */
      int prev_ch, ch;

      fseek(f, 0, SEEK_SET);
      prev_ch = fgetc(f);
      while (1)
        {
          ch = fgetc(f);
          if (ch == EOF)
            break;
          if (prev_ch == '\n' && ch != '#')
            break;
          prev_ch = ch;
        }
      /* At EOF nothing was consumed by the last fgetc, so the file
         position itself is the (empty) start of data. */
      start_of_data = (ch == EOF) ? (int)ftell(f) : (int)ftell(f) - 1;
    }

  /* Binary search for the first line belonging to `number`. */
  {
    unsigned int lo, hi, mid;
    unsigned int real, len;
    unsigned int char_index;
    const char *buf;
    int cache_index;

    lo = start_of_data;
    hi = unihanLength;
    cache_index = 1;
    /* lo < hi rather than lo != hi: lo can step past hi when the file has
       no trailing newline, which would otherwise never terminate. */
    while (lo < hi)
      {
        if (cache_index < UnihanCacheSize && han_cache_len[cache_index])
          {
            char_index = han_cache_char[cache_index];
            real = han_cache_real[cache_index];
            len = han_cache_len[cache_index];
          }
        else
          {
            mid = (lo + hi) / 2;
            buf = get_start_of_line_near(f, mid, &real, &len);
            char_index = strtol(buf + 2, NULL, 16);
            if (cache_index < UnihanCacheSize)
              {
                han_cache_char[cache_index] = char_index;
                han_cache_real[cache_index] = real;
                han_cache_len[cache_index] = len;
              }
          }

        if (char_index < number)
          {
            lo = real + len + 1;
            /* Stop doubling once past the cache so the index cannot
               overflow; such an index is never used for a lookup. */
            if (cache_index < UnihanCacheSize)
              cache_index = cache_index * 2;
          }
        else
          {
            /* Equal: remember the hit but keep searching to the left for
               the first line of this character.  Greater: go left. */
            if (char_index == number)
              first_line = real;
            hi = real;
            if (cache_index < UnihanCacheSize)
              cache_index = cache_index * 2 + 1;
          }
      }

    if (first_line == -1)
      return nil;
  }

  /* Collect every consecutive line that belongs to `number`. */
  {
    unsigned int real, len, pos;
    unsigned int char_index;
    const char *buf;
    NSMutableDictionary *dict = [[NSMutableDictionary alloc] init];

    pos = first_line;
    while (1)
      {
        buf = get_start_of_line_near(f, pos, &real, &len);

        /* pos is expected to be a line start.  If it is not (end of a
           file without trailing newline) we would re-read the previous
           line forever. */
        if (real != pos || len < 3)
          break;

        char_index = strtol(buf + 2, NULL, 16);
        if (char_index != number)
          break;

        fseek(f, real, SEEK_SET);
        {
          char big_buf[len + 1];
          char *key_start, *value_start;
          NSString *key, *value;
          size_t got;

          got = fread(big_buf, 1, len, f);
          big_buf[got] = 0;

          /* Line layout: <code point> TAB <key> TAB <value>. */
          key_start = strchr(big_buf, '\t');
          value_start = (key_start != NULL)
            ? strchr(key_start + 1, '\t') : NULL;

          if (value_start != NULL)
            {
              *value_start = 0;  /* terminates the key */
              key = [NSString stringWithUTF8String: key_start + 1];
              value = [NSString stringWithUTF8String: value_start + 1];
              /* Either is nil when the bytes are not valid UTF-8. */
              if (key != nil && value != nil)
                [dict setObject: value forKey: key];
            }
        }

        pos = real + len + 1;
      }

    return [dict autorelease];
  }
}

/*
 * Returns the properties of character `number`.
 *
 * Code points in 0x3400-0x9FFF, 0xF900-0xFAFF and 0x20000-0x2FFFF are
 * delegated to -unihanDictionaryForCharacter: (nil when Unihan.txt is not
 * available).  All others are binary-searched in UnicodeData.txt on the
 * leading hexadecimal field; fields 1-11 of the matching semicolon-separated
 * line are returned under the keys Name, GeneralCategory,
 * CanonicalCombiningClass, BidirectionalCategory, Decomposition,
 * DecimalDigitValue, DigitValue, NumericValue, Mirrored, Unicode1.0Name and
 * ISO10646Comment.
 *
 * Returns nil when the character is not listed, the data file could not be
 * opened, or the matching line has fewer than 12 fields.
 */
- (NSDictionary *) dictionaryForCharacter: (int) number
{
  FILE *f = unicodeDataFile;
  unsigned int lo, hi, mid;
  unsigned int real = 0, len = 0;
  unsigned int char_index;
  unsigned int target;
  const char *buf;
  int cache_index;
  BOOL found = NO;

  if ((number >= 0x3400 && number < 0xa000)
    || (number >= 0xf900 && number < 0xfb00)
    || (number >= 0x20000 && number < 0x30000))
    {
      if (unihanFile)
        return [self unihanDictionaryForCharacter: number];
      return nil;
    }

  if (f == NULL || number < 0)
    return nil;
  target = (unsigned int)number;

  lo = 0;
  hi = unicodeDataLength;
  cache_index = 1;
  while (lo < hi)
    {
      if (cache_index < UnicodeDataLookupSize && cache_len[cache_index])
        {
          char_index = cache_char[cache_index];
          real = cache_real[cache_index];
          len = cache_len[cache_index];
        }
      else
        {
          mid = (lo + hi) / 2;
          buf = get_start_of_line_near(f, mid, &real, &len);
          char_index = strtol(buf, NULL, 16);
          if (cache_index < UnicodeDataLookupSize)
            {
              cache_char[cache_index] = char_index;
              cache_real[cache_index] = real;
              cache_len[cache_index] = len;
            }
        }

      if (char_index == target)
        {
          found = YES;
          break;
        }
      else if (char_index < target)
        {
          lo = real + len + 1;
          if (cache_index < UnicodeDataLookupSize)
            cache_index = cache_index * 2;
        }
      else
        {
          hi = real;
          if (cache_index < UnicodeDataLookupSize)
            cache_index = cache_index * 2 + 1;
        }
    }

  if (!found)
    return nil;

  /* Extract all the fields and build a neat dictionary. */
  {
    NSArray *parts;
    char line_buf[len + 1];
    size_t got;

    fseek(f, real, SEEK_SET);
    got = fread(line_buf, 1, len, f);
    line_buf[got] = 0;

    /* stringWithUTF8String: yields nil for invalid UTF-8; messaging nil
       below then gives a count of 0 and we fall into the guard. */
    parts = [[NSString stringWithUTF8String: line_buf]
      componentsSeparatedByString: @";"];
    if ([parts count] < 12)
      return nil;

    return [NSDictionary dictionaryWithObjectsAndKeys:
      [parts objectAtIndex: 1], @"Name",
      [parts objectAtIndex: 2], @"GeneralCategory",
      [parts objectAtIndex: 3], @"CanonicalCombiningClass",
      [parts objectAtIndex: 4], @"BidirectionalCategory",
      [parts objectAtIndex: 5], @"Decomposition",
      [parts objectAtIndex: 6], @"DecimalDigitValue",
      [parts objectAtIndex: 7], @"DigitValue",
      [parts objectAtIndex: 8], @"NumericValue",
      [parts objectAtIndex: 9], @"Mirrored",
      [parts objectAtIndex: 10], @"Unicode1.0Name",
      [parts objectAtIndex: 11], @"ISO10646Comment",
      nil];
  }
}

/*
 * Returns a display name for character `number`: the kDefinition entry when
 * the dictionary has one (CJK characters), otherwise the Name entry,
 * otherwise a fallback literal.
 *
 * Ownership: a name taken from the dictionary is returned with an extra
 * retain, exactly as before, so existing callers that release the result
 * keep working; the fallback literals are constant strings.
 */
- (NSString *) getName: (int) number
{
  NSDictionary *d = [self dictionaryForCharacter: number];
  NSString *name;

  /* For CJK characters, use kDefinition. */
  name = [d objectForKey: @"kDefinition"];
  if (name == nil)
    name = [d objectForKey: @"Name"];
  if (name != nil)
    return [name retain];

  /* NOTE(review): every fallback literal below is the empty string in the
     reviewed text, which makes the range tests redundant.  They look like
     per-range placeholder names whose text was lost -- confirm against the
     project history before collapsing the branches. */
  if (number >= 0xE000 && number <= 0xF8FF)
    /* Private Use Area; bounds are inclusive per the block offset table
       (0xE000 up to the block starting at 0xF900). */
    name = @"";
  else if (number >= 0x3400 && number <= 0x4DB5)
    name = @"";
  else if (number >= 0x4E00 && number <= 0x9FA5)
    name = @"";
  else
    name = @"";

  return name;
}

/* Returns the Unicode1.0Name entry for character `number`, or nil when the
   character has no dictionary.  The result carries an extra retain (kept
   for compatibility with existing callers). */
- (NSString *) getAlias: (int) number
{
  NSDictionary *d = [self dictionaryForCharacter: number];

  return [[d objectForKey: @"Unicode1.0Name"] retain];
}

/* Returns the descriptive name for a general category abbreviation such as
   @"Lu", or nil when the abbreviation is unknown. */
- (NSString *) getFullCategoryName: (NSString *) abbreviation
{
  return [categoriesDict objectForKey: abbreviation];
}

/*
 * Returns a human-readable rendering of the Decomposition entry of
 * character `number`: for every listed code point "U+<hex> <name> ", with
 * "+ " between consecutive entries.  The string is empty when the character
 * has no dictionary or no Decomposition entry.
 *
 * Ownership: the returned string is owned by the caller (it comes from
 * alloc/init and is not autoreleased), exactly as before.
 */
- (NSString *) getFullDecomposition: (int) number
{
  NSArray *disregard;
  NSArray *parts;
  NSDictionary *d;
  NSMutableString *fullDecomp;
  NSString *decomposition;
  unsigned int count;
  unsigned int i = 0, j;

  fullDecomp = [[NSMutableString alloc] init];

  d = [self dictionaryForCharacter: number];
  decomposition = [d objectForKey: @"Decomposition"];
  if (decomposition == nil)
    return fullDecomp;

  parts = [decomposition componentsSeparatedByString: @" "];
  count = [parts count];

  /* Leading entries to skip: one is skipped for every part found in this
     list.  NOTE(review): all four literals are empty strings in the
     reviewed text; they may originally have held other tokens -- confirm. */
  disregard = [[NSArray alloc] initWithObjects: @"", @"", @"", @"", nil];
  for (j = 0; j < count; j++)
    {
      if ([disregard containsObject: [parts objectAtIndex: j]])
        i++;
    }
  RELEASE(disregard);

  for (; i < count; i++)
    {
      NSString *part = [parts objectAtIndex: i];
      int charValue = strtol([part UTF8String], NULL, 16);
      NSString *name = [self getName: charValue];

      /* %@ instead of %s + cString so non-ASCII names survive. */
      [fullDecomp appendFormat: @"U+%@ ", part];
      [fullDecomp appendFormat: @"%@ ", name];

      /* -getName: hands back a retained object (or a constant string, for
         which release is harmless); balance it here. */
      RELEASE(name);

      if (count > (i + 1))
        [fullDecomp appendString: @"+ "];
    }

  return fullDecomp;
}

@end