/* * Copyright (c) 2003 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved. * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in * compliance with the License. Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this * file. * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. * * @APPLE_LICENSE_HEADER_END@ */ /* CFUnicodeDecomposition.c Copyright 1999-2002, Apple, Inc. All rights reserved. Responsibility: Aki Inoue */ #include #if KERNEL #include #include "CFUnicodeDecomposition.h" #include "CFUniCharNonBaseData.h" #include "CFUniCharDecompData.h" #include "CFUniCharCombiningPriorityData.h" #else KERNEL #include #include #include "CFUniChar.h" #include "CFUnicodeDecomposition.h" #include "CFInternal.h" #include "CFUniCharPriv.h" #endif KERNEL // Canonical Decomposition #if KERNEL static const uint32_t __CFUniCharDecompositionTableLength = (sizeof(__CFUniCharDecompositionTable) / (sizeof(uint32_t) * 2)); // The kernel version of __CFUniCharIsDecomposableCharacter doesn't have exception ranges #define __CFUniCharIsDecomposableCharacterWithFlag(character,isHFSPlus) (__CFUniCharIsDecomposableCharacter(character)) uint8_t **CFUniCharCombiningPriorityTable = __CFUniCharCombiningPriorityTable; uint8_t **CFUniCharCombiningPriorityExtraTable = __CFUniCharCombiningPriorityExtraTable; uint8_t CFUniCharNumberOfPlanesForCombiningPriority = sizeof(__CFUniCharCombiningPriorityTable) / sizeof(*__CFUniCharCombiningPriorityTable); uint8_t **CFUniCharNonBaseBitmap = __CFUniCharNonBaseBitmap; uint8_t CFUniCharNumberOfPlanesForNonBaseBitmap = sizeof(__CFUniCharNonBaseBitmap) / sizeof(*__CFUniCharNonBaseBitmap); #else KERNEL static UTF32Char *__CFUniCharDecompositionTable = NULL; static uint32_t __CFUniCharDecompositionTableLength = 0; static UTF32Char *__CFUniCharMultipleDecompositionTable = NULL; static const uint8_t *__CFUniCharDecomposableBitmapForBMP = NULL; static const uint8_t *__CFUniCharHFSPlusDecomposableBitmapForBMP = NULL; static const uint8_t *__CFUniCharNonBaseBitmapForBMP = NULL; static CFSpinLock_t __CFUniCharDecompositionTableLock = 0; static const uint8_t **__CFUniCharCombiningPriorityTable = NULL; static uint8_t __CFUniCharCombiningPriorityTableNumPlane = 0; static void __CFUniCharLoadDecompositionTable(void) { __CFSpinLock(&__CFUniCharDecompositionTableLock); if (NULL == __CFUniCharDecompositionTable) { const void *bytes = CFUniCharGetMappingData(kCFUniCharCanonicalDecompMapping); if (NULL == bytes) { __CFSpinUnlock(&__CFUniCharDecompositionTableLock); return; } __CFUniCharDecompositionTableLength = *(((uint32_t *)bytes)++); __CFUniCharDecompositionTable = (UTF32Char *)bytes; __CFUniCharMultipleDecompositionTable = (UTF32Char *)((intptr_t)bytes + __CFUniCharDecompositionTableLength); __CFUniCharDecompositionTableLength /= (sizeof(uint32_t) * 2); __CFUniCharDecomposableBitmapForBMP = CFUniCharGetBitmapPtrForPlane(kCFUniCharCanonicalDecomposableCharacterSet, 0); __CFUniCharHFSPlusDecomposableBitmapForBMP = CFUniCharGetBitmapPtrForPlane(kCFUniCharHFSPlusDecomposableCharacterSet, 0); __CFUniCharNonBaseBitmapForBMP = CFUniCharGetBitmapPtrForPlane(kCFUniCharNonBaseCharacterSet, 0); } __CFSpinUnlock(&__CFUniCharDecompositionTableLock); } static CFSpinLock_t __CFUniCharCompatibilityDecompositionTableLock = 0; static UTF32Char *__CFUniCharCompatibilityDecompositionTable = NULL; static uint32_t __CFUniCharCompatibilityDecompositionTableLength = 0; static UTF32Char *__CFUniCharCompatibilityMultipleDecompositionTable = NULL; static void __CFUniCharLoadCompatibilityDecompositionTable(void) { __CFSpinLock(&__CFUniCharCompatibilityDecompositionTableLock); if (NULL == __CFUniCharCompatibilityDecompositionTable) { const void *bytes = CFUniCharGetMappingData(kCFUniCharCompatibilityDecompMapping); if (NULL == bytes) { __CFSpinUnlock(&__CFUniCharCompatibilityDecompositionTableLock); return; } __CFUniCharCompatibilityDecompositionTableLength = *(((uint32_t *)bytes)++); __CFUniCharCompatibilityDecompositionTable = (UTF32Char *)bytes; __CFUniCharCompatibilityMultipleDecompositionTable = (UTF32Char *)((intptr_t)bytes + __CFUniCharCompatibilityDecompositionTableLength); __CFUniCharCompatibilityDecompositionTableLength /= (sizeof(uint32_t) * 2); } __CFSpinUnlock(&__CFUniCharCompatibilityDecompositionTableLock); } CF_INLINE bool __CFUniCharIsDecomposableCharacterWithFlag(UTF32Char character, bool isHFSPlus) { return CFUniCharIsMemberOfBitmap(character, (character < 0x10000 ? (isHFSPlus ? __CFUniCharHFSPlusDecomposableBitmapForBMP : __CFUniCharDecomposableBitmapForBMP) : CFUniCharGetBitmapPtrForPlane(kCFUniCharCanonicalDecomposableCharacterSet, ((character >> 16) & 0xFF)))); } CF_INLINE bool __CFUniCharIsNonBaseCharacter(UTF32Char character) { return CFUniCharIsMemberOfBitmap(character, (character < 0x10000 ? __CFUniCharNonBaseBitmapForBMP : CFUniCharGetBitmapPtrForPlane(kCFUniCharNonBaseCharacterSet, ((character >> 16) & 0xFF)))); } #endif KERNEL typedef struct { uint32_t _key; uint32_t _value; } __CFUniCharDecomposeMappings; static uint32_t __CFUniCharGetMappedValue(const __CFUniCharDecomposeMappings *theTable, uint32_t numElem, UTF32Char character) { const __CFUniCharDecomposeMappings *p, *q, *divider; if ((character < theTable[0]._key) || (character > theTable[numElem-1]._key)) { return 0; } p = theTable; q = p + (numElem-1); while (p <= q) { divider = p + ((q - p) >> 1); /* divide by 2 */ if (character < divider->_key) { q = divider - 1; } else if (character > divider->_key) { p = divider + 1; } else { return divider->_value; } } return 0; } #if KERNEL #define __CFUniCharGetCombiningPropertyForCharacter(character) __CFUniCharGetCombiningPriority(character) #else KERNEL #define __CFUniCharGetCombiningPropertyForCharacter(character) CFUniCharGetCombiningPropertyForCharacter(character, (((character) >> 16) < __CFUniCharCombiningPriorityTableNumPlane ? __CFUniCharCombiningPriorityTable[(character) >> 16] : NULL)) #endif KERNEL static void __CFUniCharPrioritySort(UTF32Char *characters, uint32_t length) { uint32_t p1, p2; UTF32Char *ch1, *ch2; bool changes = true; UTF32Char *end = characters + length; #if !KERNEL if (NULL == __CFUniCharCombiningPriorityTable) { __CFSpinLock(&__CFUniCharDecompositionTableLock); if (NULL == __CFUniCharCombiningPriorityTable) { uint32_t numPlanes = CFUniCharGetNumberOfPlanesForUnicodePropertyData(kCFUniCharCombiningProperty); uint32_t idx; __CFUniCharCombiningPriorityTable = (const uint8_t **)CFAllocatorAllocate(NULL, sizeof(uint8_t *) * numPlanes, 0); for (idx = 0;idx < numPlanes;idx++) __CFUniCharCombiningPriorityTable[idx] = CFUniCharGetUnicodePropertyDataForPlane(kCFUniCharCombiningProperty, idx); __CFUniCharCombiningPriorityTableNumPlane = numPlanes; } __CFSpinUnlock(&__CFUniCharDecompositionTableLock); } #endif !KERNEL if (length < 2) return; do { changes = false; ch1 = characters; ch2 = characters + 1; p2 = __CFUniCharGetCombiningPropertyForCharacter(*ch1); while (ch2 < end) { p1 = p2; p2 = __CFUniCharGetCombiningPropertyForCharacter(*ch2); if (p1 > p2) { UTF32Char tmp = *ch1; *ch1 = *ch2; *ch2 = tmp; changes = true; } ++ch1; ++ch2; } } while (changes); } static uint32_t __CFUniCharRecursivelyDecomposeCharacter(UTF32Char character, UTF32Char *convertedChars, uint32_t maxBufferLength) { uint32_t value = __CFUniCharGetMappedValue((const __CFUniCharDecomposeMappings *)__CFUniCharDecompositionTable, __CFUniCharDecompositionTableLength, character); uint32_t length = CFUniCharConvertFlagToCount(value); UTF32Char firstChar = value & 0xFFFFFF; #if KERNEL const UTF32Char *mappings = (kCFUniCharNonBmpFlag & value ? (length == 1 ? &firstChar : __CFUniCharNonBMPMultipleDecompositionTable + firstChar) : NULL); UTF16Char theChar = (UTF16Char)firstChar; const UTF16Char *bmpMappings = (mappings ? NULL : (length == 1 ? &theChar : __CFUniCharMultipleDecompositionTable + firstChar)); #else KERNEL UTF32Char *mappings = (length > 1 ? __CFUniCharMultipleDecompositionTable + firstChar : &firstChar); #endif KERNEL uint32_t usedLength = 0; if (maxBufferLength < length) return 0; #if KERNEL if (bmpMappings) { if (value & kCFUniCharRecursiveDecompositionFlag) { usedLength = __CFUniCharRecursivelyDecomposeCharacter((UTF32Char)*bmpMappings, convertedChars, maxBufferLength - length); --length; // Decrement for the first char if (!usedLength || usedLength + length > maxBufferLength) return 0; ++bmpMappings; convertedChars += usedLength; } usedLength += length; while (length--) *(convertedChars++) = *(bmpMappings++); return usedLength; } #endif KERNEL if (value & kCFUniCharRecursiveDecompositionFlag) { usedLength = __CFUniCharRecursivelyDecomposeCharacter(*mappings, convertedChars, maxBufferLength - length); --length; // Decrement for the first char if (!usedLength || usedLength + length > maxBufferLength) return 0; ++mappings; convertedChars += usedLength; } usedLength += length; while (length--) *(convertedChars++) = *(mappings++); return usedLength; } #define HANGUL_SBASE 0xAC00 #define HANGUL_LBASE 0x1100 #define HANGUL_VBASE 0x1161 #define HANGUL_TBASE 0x11A7 #define HANGUL_SCOUNT 11172 #define HANGUL_LCOUNT 19 #define HANGUL_VCOUNT 21 #define HANGUL_TCOUNT 28 #define HANGUL_NCOUNT (HANGUL_VCOUNT * HANGUL_TCOUNT) uint32_t CFUniCharDecomposeCharacter(UTF32Char character, UTF32Char *convertedChars, uint32_t maxBufferLength) { #if !KERNEL if (NULL == __CFUniCharDecompositionTable) __CFUniCharLoadDecompositionTable(); #endif !KERNEL if (character >= HANGUL_SBASE && character <= (HANGUL_SBASE + HANGUL_SCOUNT)) { uint32_t length; character -= HANGUL_SBASE; length = (character % HANGUL_TCOUNT ? 3 : 2); if (maxBufferLength < length) return 0; *(convertedChars++) = character / HANGUL_NCOUNT + HANGUL_LBASE; *(convertedChars++) = (character % HANGUL_NCOUNT) / HANGUL_TCOUNT + HANGUL_VBASE; if (length > 2) *convertedChars = (character % HANGUL_TCOUNT) + HANGUL_TBASE; return length; } else { return __CFUniCharRecursivelyDecomposeCharacter(character, convertedChars, maxBufferLength); } } #if KERNEL #define CFAllocatorAllocate(a,size,flag) malloc((size)) #define CFAllocatorDeallocate(a,ptr) free((ptr)) #endif KERNEL #define MAX_BUFFER_LENGTH (32) bool CFUniCharDecompose(const UTF16Char *src, uint32_t length, uint32_t *consumedLength, void *dst, uint32_t maxLength, uint32_t *filledLength, bool needToReorder, uint32_t dstFormat, bool isHFSPlus) { uint32_t usedLength = 0; uint32_t originalLength = length; UTF32Char buffer[MAX_BUFFER_LENGTH]; UTF32Char *decompBuffer = buffer; uint32_t decompBufferLen = MAX_BUFFER_LENGTH; UTF32Char currentChar; uint32_t idx; bool isDecomp = false; bool isNonBase = false; #if !KERNEL if (NULL == __CFUniCharDecompositionTable) __CFUniCharLoadDecompositionTable(); #endif !KERNEL while (length > 0) { currentChar = *(src++); --length; if (currentChar < 0x80) { if (maxLength) { if (usedLength < maxLength) { switch (dstFormat) { case kCFUniCharUTF8Format: *(((uint8_t *)dst)++) = currentChar; break; case kCFUniCharUTF16Format: *(((UTF16Char *)dst)++) = currentChar; break; case kCFUniCharUTF32Format: *(((UTF32Char *)dst)++) = currentChar; break; } } else { if (decompBuffer != buffer) CFAllocatorDeallocate(NULL, decompBuffer); if (consumedLength) *consumedLength = originalLength - length - 1; if (filledLength) *filledLength = usedLength; return false; } } ++usedLength; continue; } if (CFUniCharIsSurrogateHighCharacter(currentChar) && (length > 0) && CFUniCharIsSurrogateLowCharacter(*src)) { currentChar = CFUniCharGetLongCharacterForSurrogatePair(currentChar, *(src++)); --length; } isDecomp = __CFUniCharIsDecomposableCharacterWithFlag(currentChar, isHFSPlus); isNonBase = (needToReorder && __CFUniCharIsNonBaseCharacter(currentChar)); if (!isDecomp || isNonBase) { if (isNonBase) { if (isDecomp) { idx = CFUniCharDecomposeCharacter(currentChar, decompBuffer, MAX_BUFFER_LENGTH); } else { idx = 1; *decompBuffer = currentChar; } while (length > 0) { if (CFUniCharIsSurrogateHighCharacter(*src) && ((length + 1) > 0) && CFUniCharIsSurrogateLowCharacter(*(src + 1))) { currentChar = CFUniCharGetLongCharacterForSurrogatePair(*src, *(src + 1)); } else { currentChar = *src; } if (__CFUniCharIsNonBaseCharacter(currentChar)) { if (currentChar > 0xFFFF) { // Non-BMP length -= 2; src += 2; } else { --length; ++src; } if ((idx + 1) >= decompBufferLen) { UTF32Char *newBuffer; decompBufferLen += MAX_BUFFER_LENGTH; newBuffer = (UTF32Char *)CFAllocatorAllocate(NULL, sizeof(UTF32Char) * decompBufferLen, 0); memmove(newBuffer, decompBuffer, (decompBufferLen - MAX_BUFFER_LENGTH) * sizeof(UTF32Char)); if (decompBuffer != buffer) CFAllocatorDeallocate(NULL, decompBuffer); decompBuffer = newBuffer; } if (__CFUniCharIsDecomposableCharacterWithFlag(currentChar, isHFSPlus)) { // Vietnamese accent, etc. idx += CFUniCharDecomposeCharacter(currentChar, decompBuffer + idx, MAX_BUFFER_LENGTH - idx); } else { decompBuffer[idx++] = currentChar; } } else { break; } } if (idx > 1) { // Need to reorder __CFUniCharPrioritySort(decompBuffer, idx); } if (!CFUniCharFillDestinationBuffer(decompBuffer, idx, &dst, maxLength, &usedLength, dstFormat)) { if (decompBuffer != buffer) CFAllocatorDeallocate(NULL, decompBuffer); if (consumedLength) *consumedLength = originalLength - length; if (filledLength) *filledLength = usedLength; return false; } } else { if (dstFormat == kCFUniCharUTF32Format) { ++usedLength; if (maxLength) { if (usedLength > maxLength) { if (decompBuffer != buffer) CFAllocatorDeallocate(NULL, decompBuffer); if (consumedLength) *consumedLength = originalLength - length; if (filledLength) *filledLength = usedLength; return false; } *(((UTF32Char *)dst)++) = currentChar; } } else { if (!CFUniCharFillDestinationBuffer(¤tChar, 1, &dst, maxLength, &usedLength, dstFormat)) { if (decompBuffer != buffer) CFAllocatorDeallocate(NULL, decompBuffer); if (consumedLength) *consumedLength = originalLength - length; if (filledLength) *filledLength = usedLength; return false; } } } } else { if (dstFormat == kCFUniCharUTF32Format && maxLength) { idx = CFUniCharDecomposeCharacter(currentChar, dst, maxLength - usedLength); if (idx == 0) { if (decompBuffer != buffer) CFAllocatorDeallocate(NULL, decompBuffer); if (consumedLength) *consumedLength = originalLength - length; if (filledLength) *filledLength = usedLength; return false; } else if (needToReorder && (idx > 1)) { // Need to reorder bool moreCombiningMarks = false; ++((UTF32Char *)dst); --idx; ++usedLength; // Skip the base while (length > 0) { if (CFUniCharIsSurrogateHighCharacter(*src) && ((length + 1) > 0) && CFUniCharIsSurrogateLowCharacter(*(src + 1))) { currentChar = CFUniCharGetLongCharacterForSurrogatePair(*src, *(src + 1)); } else { currentChar = *src; } if (__CFUniCharIsNonBaseCharacter(currentChar)) { if (currentChar > 0xFFFF) { // Non-BMP length -= 2; src += 2; } else { --length; ++src; } if ((idx + usedLength + 1) >= maxLength) { if (decompBuffer != buffer) CFAllocatorDeallocate(NULL, decompBuffer); if (consumedLength) *consumedLength = originalLength - length; if (filledLength) *filledLength = usedLength; return false; } ((UTF32Char *)dst)[idx++] = currentChar; moreCombiningMarks = true; } else { break; } } if (moreCombiningMarks) __CFUniCharPrioritySort(((UTF32Char *)dst), idx); } usedLength += idx; ((UTF32Char *)dst) += idx; } else { idx = CFUniCharDecomposeCharacter(currentChar, decompBuffer, decompBufferLen); if (maxLength && idx + usedLength > maxLength) { if (decompBuffer != buffer) CFAllocatorDeallocate(NULL, decompBuffer); if (consumedLength) *consumedLength = originalLength - length; if (filledLength) *filledLength = usedLength; return false; } else if (needToReorder && (idx > 1)) { // Need to reorder bool moreCombiningMarks = false; while (length > 0) { if (CFUniCharIsSurrogateHighCharacter(*src) && ((length + 1) > 0) && CFUniCharIsSurrogateLowCharacter(*(src + 1))) { currentChar = CFUniCharGetLongCharacterForSurrogatePair(*src, *(src + 1)); } else { currentChar = *src; } if (__CFUniCharIsNonBaseCharacter(currentChar)) { if (currentChar > 0xFFFF) { // Non-BMP length -= 2; src += 2; } else { --length; ++src; } if ((idx + 1) >= decompBufferLen) { UTF32Char *newBuffer; decompBufferLen += MAX_BUFFER_LENGTH; newBuffer = (UTF32Char *)CFAllocatorAllocate(NULL, sizeof(UTF32Char) * decompBufferLen, 0); memmove(newBuffer, decompBuffer, (decompBufferLen - MAX_BUFFER_LENGTH) * sizeof(UTF32Char)); if (decompBuffer != buffer) CFAllocatorDeallocate(NULL, decompBuffer); decompBuffer = newBuffer; } decompBuffer[idx++] = currentChar; moreCombiningMarks = true; } else { break; } } if (moreCombiningMarks) __CFUniCharPrioritySort(decompBuffer + 1, idx - 1); } if (!CFUniCharFillDestinationBuffer(decompBuffer, idx, &dst, maxLength, &usedLength, dstFormat)) { if (decompBuffer != buffer) CFAllocatorDeallocate(NULL, decompBuffer); if (consumedLength) *consumedLength = originalLength - length; if (filledLength) *filledLength = usedLength; return false; } } } } if (decompBuffer != buffer) CFAllocatorDeallocate(NULL, decompBuffer); if (consumedLength) *consumedLength = originalLength - length; if (filledLength) *filledLength = usedLength; return true; } #if !KERNEL #define MAX_COMP_DECOMP_LEN (32) static uint32_t __CFUniCharRecursivelyCompatibilityDecomposeCharacter(UTF32Char character, UTF32Char *convertedChars) { uint32_t value = __CFUniCharGetMappedValue((const __CFUniCharDecomposeMappings *)__CFUniCharCompatibilityDecompositionTable, __CFUniCharCompatibilityDecompositionTableLength, character); uint32_t length = CFUniCharConvertFlagToCount(value); UTF32Char firstChar = value & 0xFFFFFF; const UTF32Char *mappings = (length > 1 ? __CFUniCharCompatibilityMultipleDecompositionTable + firstChar : &firstChar); uint32_t usedLength = length; UTF32Char currentChar; uint32_t currentLength; while (length-- > 0) { currentChar = *(mappings++); if (__CFUniCharIsDecomposableCharacterWithFlag(currentChar, false)) { currentLength = __CFUniCharRecursivelyDecomposeCharacter(currentChar, convertedChars, MAX_COMP_DECOMP_LEN - length); convertedChars += currentLength; usedLength += (currentLength - 1); } else if (CFUniCharIsMemberOf(currentChar, kCFUniCharCompatibilityDecomposableCharacterSet)) { currentLength = __CFUniCharRecursivelyCompatibilityDecomposeCharacter(currentChar, convertedChars); convertedChars += currentLength; usedLength += (currentLength - 1); } else { *(convertedChars++) = currentChar; } } return usedLength; } CF_INLINE void __CFUniCharMoveBufferFromEnd(UTF32Char *convertedChars, uint32_t length, uint32_t delta) { const UTF32Char *limit = convertedChars; UTF32Char *dstP; convertedChars += length; dstP = convertedChars + delta; while (convertedChars > limit) *(--dstP) = *(--convertedChars); } __private_extern__ uint32_t CFUniCharCompatibilityDecompose(UTF32Char *convertedChars, uint32_t length, uint32_t maxBufferLength) { UTF32Char currentChar; UTF32Char buffer[MAX_COMP_DECOMP_LEN]; const UTF32Char *bufferP; const UTF32Char *limit = convertedChars + length; uint32_t filledLength; if (NULL == __CFUniCharCompatibilityDecompositionTable) __CFUniCharLoadCompatibilityDecompositionTable(); while (convertedChars < limit) { currentChar = *convertedChars; if (CFUniCharIsMemberOf(currentChar, kCFUniCharCompatibilityDecomposableCharacterSet)) { filledLength = __CFUniCharRecursivelyCompatibilityDecomposeCharacter(currentChar, buffer); if (filledLength + length - 1 > maxBufferLength) return 0; if (filledLength > 1) __CFUniCharMoveBufferFromEnd(convertedChars + 1, limit - convertedChars - 1, filledLength - 1); bufferP = buffer; length += (filledLength - 1); while (filledLength-- > 0) *(convertedChars++) = *(bufferP++); } else { ++convertedChars; } } return length; } CF_EXPORT void CFUniCharPrioritySort(UTF32Char *characters, uint32_t length) { __CFUniCharPrioritySort(characters, length); } #endif !KERNEL