/*
 * Copyright (c) 2005 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/*	CFStringUtilities.c
	Copyright 1999-2002, Apple, Inc. All rights reserved.
	Responsibility: Aki Inoue
*/

#include "CFInternal.h"
#include "CFStringEncodingConverterExt.h"
#include "CFUniChar.h"
/* NOTE(review): the system header names below were missing from the reviewed
   copy of this file and have been restored from the upstream CoreFoundation
   layout -- confirm against the project tree. */
#include <CoreFoundation/CFStringEncodingExt.h>
#include <limits.h>
#include <stdint.h>
#if defined(__MACH__) || defined(__LINUX__)
#include <stdlib.h>
#elif defined(__WIN32__)
#include <stdlib.h>
#include <tchar.h>
#endif

/*
 * Reports whether theEncoding can be used.
 * The encodings listed in the switch are always reported as available; any
 * other value is delegated to CFStringEncodingIsValidEncoding().
 */
Boolean CFStringIsEncodingAvailable(CFStringEncoding theEncoding) {
    switch (theEncoding) {
        case kCFStringEncodingASCII: // Built-in encodings
        case kCFStringEncodingMacRoman:
        case kCFStringEncodingUTF8:
        case kCFStringEncodingNonLossyASCII:
        case kCFStringEncodingWindowsLatin1:
        case kCFStringEncodingNextStepLatin:
        case kCFStringEncodingUTF16:
        case kCFStringEncodingUTF16BE:
        case kCFStringEncodingUTF16LE:
        case kCFStringEncodingUTF32:
        case kCFStringEncodingUTF32BE:
        case kCFStringEncodingUTF32LE:
            return true;

        default:
            return CFStringEncodingIsValidEncoding(theEncoding);
    }
}

/*
 * Returns the list produced by CFStringEncodingListOfAvailableEncodings().
 * Callers in this file walk it until kCFStringEncodingInvalidId.
 */
const CFStringEncoding* CFStringGetListOfAvailableEncodings(void) {
    return CFStringEncodingListOfAvailableEncodings();
}

/*
 * Returns a display name for theEncoding, or NULL when none is known.
 *
 * Results are memoized in a function-local dictionary keyed by the encoding
 * value; the dictionary retains the string, so the returned reference stays
 * valid without the caller owning it.
 * NOTE(review): the lazy creation of the cache is not synchronized.
 */
CFStringRef CFStringGetNameOfEncoding(CFStringEncoding theEncoding) {
    static CFMutableDictionaryRef mappingTable = NULL;
    const void *key = (const void *)(uintptr_t)theEncoding;
    CFStringRef theName = mappingTable ? (CFStringRef)CFDictionaryGetValue(mappingTable, key) : NULL;
    Boolean ownsName = false; // true only when theName was created here (not a CFSTR constant)

    if (!theName) {
        switch (theEncoding) {
            case kCFStringEncodingUTF8: theName = CFSTR("Unicode (UTF-8)"); break;
            case kCFStringEncodingUTF16: theName = CFSTR("Unicode (UTF-16)"); break;
            case kCFStringEncodingUTF16BE: theName = CFSTR("Unicode (UTF-16BE)"); break;
            case kCFStringEncodingUTF16LE: theName = CFSTR("Unicode (UTF-16LE)"); break;
            case kCFStringEncodingUTF32: theName = CFSTR("Unicode (UTF-32)"); break;
            case kCFStringEncodingUTF32BE: theName = CFSTR("Unicode (UTF-32BE)"); break;
            case kCFStringEncodingUTF32LE: theName = CFSTR("Unicode (UTF-32LE)"); break;
            case kCFStringEncodingNonLossyASCII: theName = CFSTR("Non-lossy ASCII"); break;

            default: {
                const uint8_t *encodingName = CFStringEncodingName(theEncoding);

                if (encodingName) {
                    theName = CFStringCreateWithCString(NULL, (const char *)encodingName, kCFStringEncodingASCII);
                    ownsName = (theName != NULL);
                }
            }
            break;
        }

        if (theName) {
            if (!mappingTable) mappingTable = CFDictionaryCreateMutable(NULL, 0, (const CFDictionaryKeyCallBacks *)NULL, &kCFTypeDictionaryValueCallBacks);

            CFDictionaryAddValue(mappingTable, key, (const void *)theName);
            // The dictionary now holds its own reference. Only drop the one we
            // created; CFSTR constants were never retained by us.
            if (ownsName) CFRelease(theName);
        }
    }

    return theName;
}

/*
 * Maps an IANA charset name to a CFStringEncoding.
 *
 * "utf-8" and "iso-8859-1" are matched case-insensitively up front. Any other
 * name is lower-cased and looked up in a table that is built on first use
 * from CFStringEncodingCanonicalCharsetNames() for every available encoding,
 * plus the Unicode aliases added below.
 * Returns kCFStringEncodingInvalidId when the name is not found.
 * NOTE(review): the lazy creation of the table is not synchronized.
 */
CFStringEncoding CFStringConvertIANACharSetNameToEncoding(CFStringRef charsetName) {
    static CFMutableDictionaryRef mappingTable = NULL;
    CFStringEncoding result = kCFStringEncodingInvalidId;
    CFMutableStringRef lowerCharsetName;

    /* Check for common encodings first */
    if (CFStringCompare(charsetName, CFSTR("utf-8"), kCFCompareCaseInsensitive) == kCFCompareEqualTo) {
        return kCFStringEncodingUTF8;
    } else if (CFStringCompare(charsetName, CFSTR("iso-8859-1"), kCFCompareCaseInsensitive) == kCFCompareEqualTo) {
        return kCFStringEncodingISOLatin1;
    }

    /* Create lowercase copy */
    lowerCharsetName = CFStringCreateMutableCopy(NULL, 0, charsetName);
    if (lowerCharsetName == NULL) return result;
    CFStringLowercase(lowerCharsetName, NULL);

    if (mappingTable == NULL) {
        // Keys are retained CFStrings; values are encoding numbers stored in
        // the pointer slot, hence the NULL value callbacks.
        CFMutableDictionaryRef table = CFDictionaryCreateMutable(NULL, 0, &kCFTypeDictionaryKeyCallBacks, (const CFDictionaryValueCallBacks *)NULL);
        const CFStringEncoding *encodings = CFStringGetListOfAvailableEncodings();

        while (*encodings != kCFStringEncodingInvalidId) {
            const char **nameList = CFStringEncodingCanonicalCharsetNames(*encodings);

            if (nameList) {
                while (*nameList) {
                    CFStringRef name = CFStringCreateWithCString(NULL, *nameList++, kCFStringEncodingASCII);

                    if (name) {
                        CFDictionaryAddValue(table, (const void *)name, (const void *)(uintptr_t)*encodings);
                        CFRelease(name);
                    }
                }
            }
            encodings++;
        }

        // Adding Unicode names
        CFDictionaryAddValue(table, (const void *)CFSTR("unicode-1-1"), (const void *)(uintptr_t)kCFStringEncodingUTF16);
        CFDictionaryAddValue(table, (const void *)CFSTR("iso-10646-ucs-2"), (const void *)(uintptr_t)kCFStringEncodingUTF16);
        CFDictionaryAddValue(table, (const void *)CFSTR("utf-16"), (const void *)(uintptr_t)kCFStringEncodingUTF16);
        CFDictionaryAddValue(table, (const void *)CFSTR("utf-16be"), (const void *)(uintptr_t)kCFStringEncodingUTF16BE);
        CFDictionaryAddValue(table, (const void *)CFSTR("utf-16le"), (const void *)(uintptr_t)kCFStringEncodingUTF16LE);
        CFDictionaryAddValue(table, (const void *)CFSTR("utf-32"), (const void *)(uintptr_t)kCFStringEncodingUTF32);
        CFDictionaryAddValue(table, (const void *)CFSTR("utf-32be"), (const void *)(uintptr_t)kCFStringEncodingUTF32BE);
        CFDictionaryAddValue(table, (const void *)CFSTR("utf-32le"), (const void *)(uintptr_t)kCFStringEncodingUTF32LE);

        mappingTable = table;
    }

    // ContainsKey is required because a stored value may legitimately be 0,
    // which CFDictionaryGetValue alone cannot tell apart from "missing".
    if (CFDictionaryContainsKey(mappingTable, (const void *)lowerCharsetName)) {
        result = (CFStringEncoding)(uintptr_t)CFDictionaryGetValue(mappingTable, (const void *)lowerCharsetName);
    }

    CFRelease(lowerCharsetName);

    return result;
}

/*
 * Returns the upper-cased IANA charset name for encoding, or NULL when none
 * is known.
 *
 * The UTF-16/UTF-32 family is answered from constants; everything else uses
 * the first entry of CFStringEncodingCanonicalCharsetNames(). Results are
 * memoized in a function-local dictionary that retains the string.
 * NOTE(review): the lazy creation of the cache is not synchronized.
 */
CFStringRef CFStringConvertEncodingToIANACharSetName(CFStringEncoding encoding) {
    static CFMutableDictionaryRef mappingTable = NULL;
    const void *key = (const void *)(uintptr_t)encoding;
    CFStringRef theName = mappingTable ? (CFStringRef)CFDictionaryGetValue(mappingTable, key) : NULL;
    Boolean ownsName = false; // true only when theName was created here (not a CFSTR constant)

    if (!theName) {
        switch (encoding) {
            case kCFStringEncodingUTF16: theName = CFSTR("UTF-16"); break;
            case kCFStringEncodingUTF16BE: theName = CFSTR("UTF-16BE"); break;
            case kCFStringEncodingUTF16LE: theName = CFSTR("UTF-16LE"); break;
            case kCFStringEncodingUTF32: theName = CFSTR("UTF-32"); break;
            case kCFStringEncodingUTF32BE: theName = CFSTR("UTF-32BE"); break;
            case kCFStringEncodingUTF32LE: theName = CFSTR("UTF-32LE"); break;

            default: {
                const char **nameList = CFStringEncodingCanonicalCharsetNames(encoding);

                if (nameList && *nameList) {
                    theName = CFStringCreateWithCString(NULL, *nameList, kCFStringEncodingASCII);

                    if (theName) {
                        CFMutableStringRef upperCaseName = CFStringCreateMutableCopy(NULL, 0, theName);

                        ownsName = true;
                        if (upperCaseName) {
                            CFStringUppercase(upperCaseName, NULL);
                            CFRelease(theName);
                            theName = upperCaseName;
                        }
                    }
                }
            }
            break;
        }

        if (theName) {
            if (!mappingTable) mappingTable = CFDictionaryCreateMutable(NULL, 0, (const CFDictionaryKeyCallBacks *)NULL, &kCFTypeDictionaryValueCallBacks);

            CFDictionaryAddValue(mappingTable, key, (const void *)theName);
            // The dictionary now holds its own reference. Only drop the one we
            // created; CFSTR constants were never retained by us.
            if (ownsName) CFRelease(theName);
        }
    }

    return theName;
}

/* NSStringEncoding values used by the two NS <-> CF conversion routines below. */
enum {
    NSASCIIStringEncoding = 1,		/* 0..127 only */
    NSNEXTSTEPStringEncoding = 2,
    NSJapaneseEUCStringEncoding = 3,
    NSUTF8StringEncoding = 4,
    NSISOLatin1StringEncoding = 5,
    NSSymbolStringEncoding = 6,
    NSNonLossyASCIIStringEncoding = 7,
    NSShiftJISStringEncoding = 8,
    NSISOLatin2StringEncoding = 9,
    NSUnicodeStringEncoding = 10,
    NSWindowsCP1251StringEncoding = 11,	/* Cyrillic; same as AdobeStandardCyrillic */
    NSWindowsCP1252StringEncoding = 12,	/* WinLatin1 */
    NSWindowsCP1253StringEncoding = 13,	/* Greek */
    NSWindowsCP1254StringEncoding = 14,	/* Turkish */
    NSWindowsCP1250StringEncoding = 15,	/* WinLatin2 */
    NSISO2022JPStringEncoding = 21,	/* ISO 2022 Japanese encoding for e-mail */
    NSMacOSRomanStringEncoding = 30,

    NSProprietaryStringEncoding = 65536	/* Installation-specific encoding */
};

/* Top bit marks an NSStringEncoding that merely wraps a CFStringEncoding.
   Unsigned literal: shifting a signed 1 into bit 31 is undefined behaviour. */
#define NSENCODING_MASK (1U << 31)

/*
 * Converts a CFStringEncoding to an NSStringEncoding value.
 * Encodings without a dedicated NS constant are returned as the CF value
 * with NSENCODING_MASK set.
 */
UInt32 CFStringConvertEncodingToNSStringEncoding(CFStringEncoding theEncoding) {
    switch (theEncoding & 0xFFF) {
        case kCFStringEncodingASCII: return NSASCIIStringEncoding;
        case kCFStringEncodingNextStepLatin: return NSNEXTSTEPStringEncoding;
        case kCFStringEncodingISOLatin1: return NSISOLatin1StringEncoding;
        case kCFStringEncodingNonLossyASCII: return NSNonLossyASCIIStringEncoding;
        case kCFStringEncodingWindowsLatin1: return NSWindowsCP1252StringEncoding;
        case kCFStringEncodingMacRoman: return NSMacOSRomanStringEncoding;
#if defined(__MACH__)
        case kCFStringEncodingEUC_JP: return NSJapaneseEUCStringEncoding;
        case kCFStringEncodingMacSymbol: return NSSymbolStringEncoding;
        case kCFStringEncodingDOSJapanese: return NSShiftJISStringEncoding;
        case kCFStringEncodingISOLatin2: return NSISOLatin2StringEncoding;
        case kCFStringEncodingWindowsCyrillic: return NSWindowsCP1251StringEncoding;
        case kCFStringEncodingWindowsGreek: return NSWindowsCP1253StringEncoding;
        case kCFStringEncodingWindowsLatin5: return NSWindowsCP1254StringEncoding;
        case kCFStringEncodingWindowsLatin2: return NSWindowsCP1250StringEncoding;
        case kCFStringEncodingISO_2022_JP: return NSISO2022JPStringEncoding;
        case kCFStringEncodingUnicode:
            if (theEncoding == kCFStringEncodingUTF16) return NSUnicodeStringEncoding;
            else if (theEncoding == kCFStringEncodingUTF8) return NSUTF8StringEncoding;
#endif // __MACH__
            /* fall-through for other encoding schemes */

        default:
            return NSENCODING_MASK | theEncoding;
    }
}

/*
 * Converts an NSStringEncoding value to a CFStringEncoding.
 * Values carrying NSENCODING_MASK are unwrapped by clearing the mask; any
 * other unrecognized value yields kCFStringEncodingInvalidId.
 */
CFStringEncoding CFStringConvertNSStringEncodingToEncoding(UInt32 theEncoding) {
    switch (theEncoding) {
        case NSASCIIStringEncoding: return kCFStringEncodingASCII;
        case NSNEXTSTEPStringEncoding: return kCFStringEncodingNextStepLatin;
        case NSUTF8StringEncoding: return kCFStringEncodingUTF8;
        case NSISOLatin1StringEncoding: return kCFStringEncodingISOLatin1;
        case NSNonLossyASCIIStringEncoding: return kCFStringEncodingNonLossyASCII;
        case NSUnicodeStringEncoding: return kCFStringEncodingUTF16;
        case NSWindowsCP1252StringEncoding: return kCFStringEncodingWindowsLatin1;
        case NSMacOSRomanStringEncoding: return kCFStringEncodingMacRoman;
#if defined(__MACH__)
        case NSSymbolStringEncoding: return kCFStringEncodingMacSymbol;
        case NSJapaneseEUCStringEncoding: return kCFStringEncodingEUC_JP;
        case NSShiftJISStringEncoding: return kCFStringEncodingDOSJapanese;
        case NSISO2022JPStringEncoding: return kCFStringEncodingISO_2022_JP;
        case NSISOLatin2StringEncoding: return kCFStringEncodingISOLatin2;
        case NSWindowsCP1251StringEncoding: return kCFStringEncodingWindowsCyrillic;
        case NSWindowsCP1253StringEncoding: return kCFStringEncodingWindowsGreek;
        case NSWindowsCP1254StringEncoding: return kCFStringEncodingWindowsLatin5;
        case NSWindowsCP1250StringEncoding: return kCFStringEncodingWindowsLatin2;
#endif // __MACH__
        default:
            return ((theEncoding & NSENCODING_MASK) ? theEncoding & ~NSENCODING_MASK : kCFStringEncodingInvalidId);
    }
}

#define MACCODEPAGE_BASE (10000)
#define ISO8859CODEPAGE_BASE (28590)

/* Marks a slot of _CFToDOSCodePageList that has no code page. */
enum { kCFNoCodePage = 0xFFFF };

/* Indexed by (encodingBase - 0x400). */
static const uint16_t _CFToDOSCodePageList[] = {
    437, kCFNoCodePage, kCFNoCodePage, kCFNoCodePage, kCFNoCodePage, 737, 775, kCFNoCodePage,
    kCFNoCodePage, kCFNoCodePage, kCFNoCodePage, kCFNoCodePage, kCFNoCodePage, kCFNoCodePage, kCFNoCodePage, kCFNoCodePage, // 0x400
    850, 851, 852, 855, 857, 860, 861, 862,
    863, 864, 865, 866, 869, 874, kCFNoCodePage, kCFNoCodePage, // 0x410
    932, 936, 949, 950, // 0x420
};

/* Indexed by (encodingBase - 0x500). */
static const uint16_t _CFToWindowsCodePageList[] = {
    1252, 1250, 1251, 1253, 1254, 1255, 1256, 1257, 1258,
};

/*
 * Converts a CFStringEncoding to a Windows code page number.
 * Returns kCFStringEncodingInvalidId when no code page is known for it.
 * Only the Unicode family is handled when __MACH__ is not defined.
 */
UInt32 CFStringConvertEncodingToWindowsCodepage(CFStringEncoding theEncoding) {
#if defined(__MACH__)
    CFStringEncoding encodingBase = theEncoding & 0x0FFF;
#endif

    switch (theEncoding & 0x0F00) {
#if defined(__MACH__)
        case 0: // Mac OS script
            if (encodingBase <= kCFStringEncodingMacCentralEurRoman) {
                return MACCODEPAGE_BASE + encodingBase;
            } else if (encodingBase == kCFStringEncodingMacTurkish) {
                return 10081;
            } else if (encodingBase == kCFStringEncodingMacCroatian) {
                return 10082;
            } else if (encodingBase == kCFStringEncodingMacIcelandic) {
                return 10079;
            }
            break;
#endif

        case 0x100: // Unicode
            switch (theEncoding) {
                case kCFStringEncodingUTF8: return 65001;
                case kCFStringEncodingUTF16: return 1200;
                case kCFStringEncodingUTF16BE: return 1201;
                case kCFStringEncodingUTF32: return 65005;
                case kCFStringEncodingUTF32BE: return 65006;
                default: break;
            }
            break;

#if defined(__MACH__)
        case 0x0200: // ISO 8859 series
            if (encodingBase <= kCFStringEncodingISOLatin10) return ISO8859CODEPAGE_BASE + (encodingBase - 0x200);
            break;

        case 0x0400: // DOS codepage
            if (encodingBase <= kCFStringEncodingDOSChineseTrad) {
                UInt32 codepage = _CFToDOSCodePageList[encodingBase - 0x400];

                // Unassigned slots must report "no mapping", not 65535.
                if (codepage != kCFNoCodePage) return codepage;
            }
            break;

        case 0x0500: // ANSI (Windows) codepage
            // Index with the masked base; the raw value may carry bits above
            // 0x0FFF and would run off the end of the table.
            if (encodingBase <= kCFStringEncodingWindowsVietnamese) return _CFToWindowsCodePageList[encodingBase - 0x500];
            else if (encodingBase == kCFStringEncodingWindowsKoreanJohab) return 1361;
            break;

        case 0x600: // National standards
            if (encodingBase == kCFStringEncodingASCII) return 20127;
            else if (encodingBase == kCFStringEncodingGB_18030_2000) return 54936;
            break;

        case 0x0800: // ISO 2022 series
            switch (encodingBase) {
                case kCFStringEncodingISO_2022_JP: return 50220;
                case kCFStringEncodingISO_2022_CN: return 50227;
                case kCFStringEncodingISO_2022_KR: return 50225;
                default: break;
            }
            break;

        case 0x0900: // EUC series
            // Matched by constant rather than by table offset, so this does
            // not depend on the EUC constants being contiguous from 0x900.
            // Values mirror _CFACPToCFTable.
            switch (encodingBase) {
                case kCFStringEncodingEUC_JP: return 51932;
                case kCFStringEncodingEUC_CN: return 51936;
                case kCFStringEncodingEUC_TW: return 51950;
                case kCFStringEncodingEUC_KR: return 51949;
                default: break;
            }
            break;

        case 0x0A00: // Misc encodings
            switch (encodingBase) {
                case kCFStringEncodingKOI8_R: return 20866;
                case kCFStringEncodingHZ_GB_2312: return 52936;
                case kCFStringEncodingKOI8_U: return 21866;
                default: break;
            }
            break;

        case 0x0C00: // IBM EBCDIC encodings
            if (encodingBase == kCFStringEncodingEBCDIC_CP037) return 37;
            break;
#endif // __MACH__

        default:
            break;
    }

    return kCFStringEncodingInvalidId;
}

#if defined(__MACH__)
/* Code page -> encoding pairs. Must stay sorted by acp (ascending) for
   bsearchEncoding(). */
static const struct {
    uint16_t acp;
    uint16_t encoding;
} _CFACPToCFTable[] = {
    {37, kCFStringEncodingEBCDIC_CP037},
    {437, kCFStringEncodingDOSLatinUS},
    {737, kCFStringEncodingDOSGreek},
    {775, kCFStringEncodingDOSBalticRim},
    {850, kCFStringEncodingDOSLatin1},
    {851, kCFStringEncodingDOSGreek1},
    {852, kCFStringEncodingDOSLatin2},
    {855, kCFStringEncodingDOSCyrillic},
    {857, kCFStringEncodingDOSTurkish},
    {860, kCFStringEncodingDOSPortuguese},
    {861, kCFStringEncodingDOSIcelandic},
    {862, kCFStringEncodingDOSHebrew},
    {863, kCFStringEncodingDOSCanadianFrench},
    {864, kCFStringEncodingDOSArabic},
    {865, kCFStringEncodingDOSNordic},
    {866, kCFStringEncodingDOSRussian},
    {869, kCFStringEncodingDOSGreek2},
    {874, kCFStringEncodingDOSThai},
    {932, kCFStringEncodingDOSJapanese},
    {936, kCFStringEncodingDOSChineseSimplif},
    {949, kCFStringEncodingDOSKorean},
    {950, kCFStringEncodingDOSChineseTrad},
    {1250, kCFStringEncodingWindowsLatin2},
    {1251, kCFStringEncodingWindowsCyrillic},
    {1252, kCFStringEncodingWindowsLatin1},
    {1253, kCFStringEncodingWindowsGreek},
    {1254, kCFStringEncodingWindowsLatin5},
    {1255, kCFStringEncodingWindowsHebrew},
    {1256, kCFStringEncodingWindowsArabic},
    {1257, kCFStringEncodingWindowsBalticRim},
    {1258, kCFStringEncodingWindowsVietnamese},
    {1361, kCFStringEncodingWindowsKoreanJohab},
    {20127, kCFStringEncodingASCII},
    {20866, kCFStringEncodingKOI8_R},
    {21866, kCFStringEncodingKOI8_U},
    {50220, kCFStringEncodingISO_2022_JP},
    {50225, kCFStringEncodingISO_2022_KR},
    {50227, kCFStringEncodingISO_2022_CN},
    {51932, kCFStringEncodingEUC_JP},
    {51936, kCFStringEncodingEUC_CN},
    {51949, kCFStringEncodingEUC_KR},
    {51950, kCFStringEncodingEUC_TW},
    {52936, kCFStringEncodingHZ_GB_2312},
    {54936, kCFStringEncodingGB_18030_2000},
};

/*
 * Exact-match binary search of _CFACPToCFTable.
 * Returns the encoding paired with code page `target`, or
 * kCFStringEncodingInvalidId when the code page is not listed. The target is
 * taken at full width so values above 65535 cannot alias a table entry.
 */
static CFStringEncoding bsearchEncoding(UInt32 target) {
    UInt32 low = 0;
    UInt32 high = (UInt32)(sizeof(_CFACPToCFTable) / sizeof(_CFACPToCFTable[0])); // one past the last candidate

    while (low < high) {
        UInt32 middle = low + ((high - low) / 2);
        UInt32 candidate = _CFACPToCFTable[middle].acp;

        if (candidate == target) return _CFACPToCFTable[middle].encoding;
        else if (candidate < target) low = middle + 1;
        else high = middle;
    }
    return (kCFStringEncodingInvalidId);
}
#endif

/*
 * Converts a Windows code page number to a CFStringEncoding.
 * 0 and 1 resolve to CFStringGetSystemEncoding(). Returns
 * kCFStringEncodingInvalidId for code pages with no known encoding.
 */
CFStringEncoding CFStringConvertWindowsCodepageToEncoding(UInt32 theEncoding) {
    if (theEncoding == 0 || theEncoding == 1) { // ID for default (system) codepage
        return CFStringGetSystemEncoding();
    }

    // Unicode family; numbers mirror CFStringConvertEncodingToWindowsCodepage().
    switch (theEncoding) {
        case 65001: return kCFStringEncodingUTF8;
        case 1200: return kCFStringEncodingUTF16;
        case 1201: return kCFStringEncodingUTF16BE;
        case 65005: return kCFStringEncodingUTF32;
        case 65006: return kCFStringEncodingUTF32BE;
        default: break;
    }

    if ((theEncoding >= MACCODEPAGE_BASE) && (theEncoding < 20000)) { // Mac script
        if (theEncoding <= 10029) return theEncoding - MACCODEPAGE_BASE; // up to Mac Central European
#if defined(__MACH__)
        else if (theEncoding == 10079) return kCFStringEncodingMacIcelandic;
        else if (theEncoding == 10081) return kCFStringEncodingMacTurkish;
        else if (theEncoding == 10082) return kCFStringEncodingMacCroatian;
#endif
        return kCFStringEncodingInvalidId;
    }

    if ((theEncoding >= ISO8859CODEPAGE_BASE) && (theEncoding <= 28605)) { // ISO 8859
        return (theEncoding - ISO8859CODEPAGE_BASE) + 0x200;
    }

#if defined(__MACH__)
    return bsearchEncoding(theEncoding);
#else
    return kCFStringEncodingInvalidId;
#endif
}

/*
 * Returns the result of CFStringEncodingGetScriptCodeForEncoding() for
 * `encoding`.
 */
CFStringEncoding CFStringGetMostCompatibleMacStringEncoding(CFStringEncoding encoding) {
    return CFStringEncodingGetScriptCodeForEncoding(encoding);
}