| /* CFUnicodeDecomposition.c |
| Copyright (c) 1999-2016, Apple Inc. and the Swift project authors |
| |
| Portions Copyright (c) 2014-2016 Apple Inc. and the Swift project authors |
| Licensed under Apache License v2.0 with Runtime Library Exception |
| See http://swift.org/LICENSE.txt for license information |
| See http://swift.org/CONTRIBUTORS.txt for the list of Swift project authors |
| Responsibility: Foundation Team |
| */ |
| |
| #include <string.h> |
| #include <CoreFoundation/CFBase.h> |
| #include <CoreFoundation/CFCharacterSet.h> |
| #include <CoreFoundation/CFUniChar.h> |
| #include <CoreFoundation/CFUnicodeDecomposition.h> |
| #include "CFInternal.h" |
| #include "CFUniCharPriv.h" |
| |
| // Canonical Decomposition |
| static UTF32Char *__CFUniCharDecompositionTable = NULL; |
| static uint32_t __CFUniCharDecompositionTableLength = 0; |
| static UTF32Char *__CFUniCharMultipleDecompositionTable = NULL; |
| |
| static const uint8_t *__CFUniCharDecomposableBitmapForBMP = NULL; |
| static const uint8_t *__CFUniCharHFSPlusDecomposableBitmapForBMP = NULL; |
| |
| static CFLock_t __CFUniCharDecompositionTableLock = CFLockInit; |
| |
| static const uint8_t **__CFUniCharCombiningPriorityTable = NULL; |
| static uint8_t __CFUniCharCombiningPriorityTableNumPlane = 0; |
| |
| static void __CFUniCharLoadDecompositionTable(void) { |
| |
| __CFLock(&__CFUniCharDecompositionTableLock); |
| |
| if (NULL == __CFUniCharDecompositionTable) { |
| const uint32_t *bytes = (uint32_t *)CFUniCharGetMappingData(kCFUniCharCanonicalDecompMapping); |
| |
| if (NULL == bytes) { |
| __CFUnlock(&__CFUniCharDecompositionTableLock); |
| return; |
| } |
| |
| __CFUniCharDecompositionTableLength = *(bytes++); |
| __CFUniCharDecompositionTable = (UTF32Char *)bytes; |
| __CFUniCharMultipleDecompositionTable = (UTF32Char *)((intptr_t)bytes + __CFUniCharDecompositionTableLength); |
| |
| __CFUniCharDecompositionTableLength /= (sizeof(uint32_t) * 2); |
| __CFUniCharDecomposableBitmapForBMP = CFUniCharGetBitmapPtrForPlane(kCFUniCharCanonicalDecomposableCharacterSet, 0); |
| __CFUniCharHFSPlusDecomposableBitmapForBMP = CFUniCharGetBitmapPtrForPlane(kCFUniCharHFSPlusDecomposableCharacterSet, 0); |
| |
| CFIndex idx; |
| |
| __CFUniCharCombiningPriorityTableNumPlane = CFUniCharGetNumberOfPlanesForUnicodePropertyData(kCFUniCharCombiningProperty); |
| __CFUniCharCombiningPriorityTable = (const uint8_t **)CFAllocatorAllocate(kCFAllocatorSystemDefault, sizeof(uint8_t *) * __CFUniCharCombiningPriorityTableNumPlane, 0); |
| for (idx = 0;idx < __CFUniCharCombiningPriorityTableNumPlane;idx++) __CFUniCharCombiningPriorityTable[idx] = (const uint8_t *)CFUniCharGetUnicodePropertyDataForPlane(kCFUniCharCombiningProperty, idx); |
| } |
| |
| __CFUnlock(&__CFUniCharDecompositionTableLock); |
| } |
| |
| static CFLock_t __CFUniCharCompatibilityDecompositionTableLock = CFLockInit; |
| static UTF32Char *__CFUniCharCompatibilityDecompositionTable = NULL; |
| static uint32_t __CFUniCharCompatibilityDecompositionTableLength = 0; |
| static UTF32Char *__CFUniCharCompatibilityMultipleDecompositionTable = NULL; |
| |
| static void __CFUniCharLoadCompatibilityDecompositionTable(void) { |
| |
| __CFLock(&__CFUniCharCompatibilityDecompositionTableLock); |
| |
| if (NULL == __CFUniCharCompatibilityDecompositionTable) { |
| const uint32_t *bytes = (uint32_t *)CFUniCharGetMappingData(kCFUniCharCompatibilityDecompMapping); |
| |
| if (NULL == bytes) { |
| __CFUnlock(&__CFUniCharCompatibilityDecompositionTableLock); |
| return; |
| } |
| |
| __CFUniCharCompatibilityDecompositionTableLength = *(bytes++); |
| __CFUniCharCompatibilityDecompositionTable = (UTF32Char *)bytes; |
| __CFUniCharCompatibilityMultipleDecompositionTable = (UTF32Char *)((intptr_t)bytes + __CFUniCharCompatibilityDecompositionTableLength); |
| |
| __CFUniCharCompatibilityDecompositionTableLength /= (sizeof(uint32_t) * 2); |
| } |
| |
| __CFUnlock(&__CFUniCharCompatibilityDecompositionTableLock); |
| } |
| |
| CF_INLINE bool __CFUniCharIsDecomposableCharacterWithFlag(UTF32Char character, bool isHFSPlus) { |
| return CFUniCharIsMemberOfBitmap(character, (character < 0x10000 ? (isHFSPlus ? __CFUniCharHFSPlusDecomposableBitmapForBMP : __CFUniCharDecomposableBitmapForBMP) : CFUniCharGetBitmapPtrForPlane(kCFUniCharCanonicalDecomposableCharacterSet, ((character >> 16) & 0xFF)))); |
| } |
| |
| CF_INLINE uint8_t __CFUniCharGetCombiningPropertyForCharacter(UTF32Char character) { return CFUniCharGetCombiningPropertyForCharacter(character, (((character) >> 16) < __CFUniCharCombiningPriorityTableNumPlane ? __CFUniCharCombiningPriorityTable[(character) >> 16] : NULL)); } |
| |
| CF_INLINE bool __CFUniCharIsNonBaseCharacter(UTF32Char character) { return ((0 == __CFUniCharGetCombiningPropertyForCharacter(character)) ? false : true); } // the notion of non-base in normalization is characters with non-0 combining class |
| |
| typedef struct { |
| uint32_t _key; |
| uint32_t _value; |
| } __CFUniCharDecomposeMappings; |
| |
| static uint32_t __CFUniCharGetMappedValue(const __CFUniCharDecomposeMappings *theTable, uint32_t numElem, UTF32Char character) { |
| const __CFUniCharDecomposeMappings *p, *q, *divider; |
| |
| if ((character < theTable[0]._key) || (character > theTable[numElem-1]._key)) { |
| return 0; |
| } |
| p = theTable; |
| q = p + (numElem-1); |
| while (p <= q) { |
| divider = p + ((q - p) >> 1); /* divide by 2 */ |
| if (character < divider->_key) { q = divider - 1; } |
| else if (character > divider->_key) { p = divider + 1; } |
| else { return divider->_value; } |
| } |
| return 0; |
| } |
| |
| static void __CFUniCharPrioritySort(UTF32Char *characters, CFIndex length) { |
| UTF32Char *end = characters + length; |
| |
| while ((characters < end) && (0 == __CFUniCharGetCombiningPropertyForCharacter(*characters))) ++characters; |
| |
| if ((end - characters) > 1) { |
| uint32_t p1, p2; |
| UTF32Char *ch1, *ch2; |
| bool changes = true; |
| |
| do { |
| changes = false; |
| ch1 = characters; ch2 = characters + 1; |
| p2 = __CFUniCharGetCombiningPropertyForCharacter(*ch1); |
| while (ch2 < end) { |
| p1 = p2; p2 = __CFUniCharGetCombiningPropertyForCharacter(*ch2); |
| if (p1 > p2) { |
| UTF32Char tmp = *ch1; *ch1 = *ch2; *ch2 = tmp; |
| changes = true; |
| } |
| ++ch1; ++ch2; |
| } |
| } while (changes); |
| } |
| } |
| |
| static CFIndex __CFUniCharRecursivelyDecomposeCharacter(UTF32Char character, UTF32Char *convertedChars, CFIndex maxBufferLength) { |
| uint32_t value = __CFUniCharGetMappedValue((const __CFUniCharDecomposeMappings *)__CFUniCharDecompositionTable, __CFUniCharDecompositionTableLength, character); |
| CFIndex length = CFUniCharConvertFlagToCount(value); |
| UTF32Char firstChar = value & 0xFFFFFF; |
| UTF32Char *mappings = (length > 1 ? __CFUniCharMultipleDecompositionTable + firstChar : &firstChar); |
| CFIndex usedLength = 0; |
| |
| if (maxBufferLength < length) return 0; |
| |
| if (value & kCFUniCharRecursiveDecompositionFlag) { |
| usedLength = __CFUniCharRecursivelyDecomposeCharacter(*mappings, convertedChars, maxBufferLength - length); |
| |
| --length; // Decrement for the first char |
| if (!usedLength || usedLength + length > maxBufferLength) return 0; |
| ++mappings; |
| convertedChars += usedLength; |
| } |
| |
| usedLength += length; |
| |
| while (length--) *(convertedChars++) = *(mappings++); |
| |
| return usedLength; |
| } |
| |
| #define HANGUL_SBASE 0xAC00 |
| #define HANGUL_LBASE 0x1100 |
| #define HANGUL_VBASE 0x1161 |
| #define HANGUL_TBASE 0x11A7 |
| #define HANGUL_SCOUNT 11172 |
| #define HANGUL_LCOUNT 19 |
| #define HANGUL_VCOUNT 21 |
| #define HANGUL_TCOUNT 28 |
| #define HANGUL_NCOUNT (HANGUL_VCOUNT * HANGUL_TCOUNT) |
| |
| CFIndex CFUniCharDecomposeCharacter(UTF32Char character, UTF32Char *convertedChars, CFIndex maxBufferLength) { |
| if (NULL == __CFUniCharDecompositionTable) __CFUniCharLoadDecompositionTable(); |
| if (character >= HANGUL_SBASE && character <= (HANGUL_SBASE + HANGUL_SCOUNT)) { |
| CFIndex length; |
| |
| character -= HANGUL_SBASE; |
| |
| length = (character % HANGUL_TCOUNT ? 3 : 2); |
| |
| if (maxBufferLength < length) return 0; |
| |
| *(convertedChars++) = character / HANGUL_NCOUNT + HANGUL_LBASE; |
| *(convertedChars++) = (character % HANGUL_NCOUNT) / HANGUL_TCOUNT + HANGUL_VBASE; |
| if (length > 2) *convertedChars = (character % HANGUL_TCOUNT) + HANGUL_TBASE; |
| return length; |
| } else { |
| return __CFUniCharRecursivelyDecomposeCharacter(character, convertedChars, maxBufferLength); |
| } |
| } |
| |
| CF_INLINE bool __CFProcessReorderBuffer(UTF32Char *buffer, CFIndex length, void **dst, CFIndex dstLength, CFIndex *filledLength, uint32_t dstFormat) { |
| if (length > 1) __CFUniCharPrioritySort(buffer, length); |
| return CFUniCharFillDestinationBuffer(buffer, length, dst, dstLength, filledLength, dstFormat); |
| } |
| |
| #define MAX_BUFFER_LENGTH (32) |
| bool CFUniCharDecomposeWithErrorLocation(const UTF16Char *src, CFIndex length, CFIndex *consumedLength, void *dst, CFIndex maxLength, CFIndex *filledLength, bool needToReorder, uint32_t dstFormat, bool isHFSPlus, CFIndex *charIndex) { |
| CFIndex usedLength = 0; |
| const UTF16Char * const originalSrc = src; |
| CFIndex originalLength = length; |
| UTF32Char buffer[MAX_BUFFER_LENGTH]; |
| UTF32Char *decompBuffer = buffer; |
| CFIndex decompBufferSize = MAX_BUFFER_LENGTH; |
| CFIndex decompBufferLen = 0; |
| CFIndex segmentLength = 0; |
| UTF32Char currentChar; |
| |
| // kCFNotFound indicates an insufficiently sized buffer, which is the default failure case. |
| if (charIndex) *charIndex = kCFNotFound; |
| |
| if (NULL == __CFUniCharDecompositionTable) __CFUniCharLoadDecompositionTable(); |
| |
| while ((length - segmentLength) > 0) { |
| currentChar = *(src++); |
| |
| if (currentChar < 0x80) { |
| if (decompBufferLen > 0) { |
| if (!__CFProcessReorderBuffer(decompBuffer, decompBufferLen, &dst, maxLength, &usedLength, dstFormat)) break; |
| length -= segmentLength; |
| segmentLength = 0; |
| decompBufferLen = 0; |
| } |
| |
| if (maxLength > 0) { |
| if (usedLength >= maxLength) break; |
| switch (dstFormat) { |
| case kCFUniCharUTF8Format: *(uint8_t *)dst = currentChar; dst = (uint8_t *)dst + sizeof(uint8_t); break; |
| case kCFUniCharUTF16Format: *(UTF16Char *)dst = currentChar; dst = (uint8_t *)dst + sizeof(UTF16Char); break; |
| case kCFUniCharUTF32Format: *(UTF32Char *)dst = currentChar; dst = (uint8_t *)dst + sizeof(UTF32Char); break; |
| } |
| } |
| |
| --length; |
| ++usedLength; |
| } else { |
| if (CFUniCharIsSurrogateLowCharacter(currentChar)) { // Stray surrogagte |
| if (dstFormat != kCFUniCharUTF16Format) { |
| if (charIndex) *charIndex = src - 1 - originalSrc; |
| break; |
| } |
| } else if (CFUniCharIsSurrogateHighCharacter(currentChar)) { |
| if (((length - segmentLength) > 1) && CFUniCharIsSurrogateLowCharacter(*src)) { |
| currentChar = CFUniCharGetLongCharacterForSurrogatePair(currentChar, *(src++)); |
| } else { |
| if (dstFormat != kCFUniCharUTF16Format) { |
| if (charIndex) *charIndex = src - originalSrc; |
| break; |
| } |
| } |
| } |
| |
| if (needToReorder && __CFUniCharIsNonBaseCharacter(currentChar)) { |
| if ((decompBufferLen + 1) >= decompBufferSize) { |
| UTF32Char *newBuffer; |
| |
| decompBufferSize += MAX_BUFFER_LENGTH; |
| newBuffer = (UTF32Char *)CFAllocatorAllocate(kCFAllocatorSystemDefault, sizeof(UTF32Char) * decompBufferSize, 0); |
| memmove(newBuffer, decompBuffer, (decompBufferSize - MAX_BUFFER_LENGTH) * sizeof(UTF32Char)); |
| if (decompBuffer != buffer) CFAllocatorDeallocate(kCFAllocatorSystemDefault, decompBuffer); |
| decompBuffer = newBuffer; |
| } |
| |
| if (__CFUniCharIsDecomposableCharacterWithFlag(currentChar, isHFSPlus)) { // Vietnamese accent, etc. |
| decompBufferLen += CFUniCharDecomposeCharacter(currentChar, decompBuffer + decompBufferLen, decompBufferSize - decompBufferLen); |
| } else { |
| decompBuffer[decompBufferLen++] = currentChar; |
| } |
| } else { |
| if (decompBufferLen > 0) { |
| if (!__CFProcessReorderBuffer(decompBuffer, decompBufferLen, &dst, maxLength, &usedLength, dstFormat)) break; |
| length -= segmentLength; |
| segmentLength = 0; |
| } |
| |
| if (__CFUniCharIsDecomposableCharacterWithFlag(currentChar, isHFSPlus)) { |
| decompBufferLen = CFUniCharDecomposeCharacter(currentChar, decompBuffer, MAX_BUFFER_LENGTH); |
| } else { |
| decompBufferLen = 1; |
| *decompBuffer = currentChar; |
| } |
| |
| if (!needToReorder || (decompBufferLen == 1)) { |
| if (!CFUniCharFillDestinationBuffer(decompBuffer, decompBufferLen, &dst, maxLength, &usedLength, dstFormat)) break; |
| length -= ((currentChar > 0xFFFF) ? 2 : 1); |
| decompBufferLen = 0; |
| continue; |
| } |
| } |
| |
| segmentLength += ((currentChar > 0xFFFF) ? 2 : 1); |
| } |
| } |
| |
| if ((decompBufferLen > 0) && __CFProcessReorderBuffer(decompBuffer, decompBufferLen, &dst, maxLength, &usedLength, dstFormat)) length -= segmentLength; |
| |
| if (decompBuffer != buffer) CFAllocatorDeallocate(kCFAllocatorSystemDefault, decompBuffer); |
| |
| if (consumedLength) *consumedLength = originalLength - length; |
| if (filledLength) *filledLength = usedLength; |
| |
| return ((length > 0) ? false : true); |
| } |
| |
| bool CFUniCharDecompose(const UTF16Char *src, CFIndex length, CFIndex *consumedLength, void *dst, CFIndex maxLength, CFIndex *filledLength, bool needToReorder, uint32_t dstFormat, bool isHFSPlus) { |
| return CFUniCharDecomposeWithErrorLocation(src, length, consumedLength, dst, maxLength, filledLength, needToReorder, dstFormat, isHFSPlus, NULL); |
| } |
| |
| #define MAX_COMP_DECOMP_LEN (32) |
| |
| static CFIndex __CFUniCharRecursivelyCompatibilityDecomposeCharacter(UTF32Char character, UTF32Char *convertedChars) { |
| uint32_t value = __CFUniCharGetMappedValue((const __CFUniCharDecomposeMappings *)__CFUniCharCompatibilityDecompositionTable, __CFUniCharCompatibilityDecompositionTableLength, character); |
| CFIndex length = CFUniCharConvertFlagToCount(value); |
| UTF32Char firstChar = value & 0xFFFFFF; |
| const UTF32Char *mappings = (length > 1 ? __CFUniCharCompatibilityMultipleDecompositionTable + firstChar : &firstChar); |
| CFIndex usedLength = length; |
| UTF32Char currentChar; |
| CFIndex currentLength; |
| |
| while (length-- > 0) { |
| currentChar = *(mappings++); |
| if (__CFUniCharIsDecomposableCharacterWithFlag(currentChar, false)) { |
| currentLength = __CFUniCharRecursivelyDecomposeCharacter(currentChar, convertedChars, MAX_COMP_DECOMP_LEN - length); |
| convertedChars += currentLength; |
| usedLength += (currentLength - 1); |
| } else if (CFUniCharIsMemberOf(currentChar, kCFUniCharCompatibilityDecomposableCharacterSet)) { |
| currentLength = __CFUniCharRecursivelyCompatibilityDecomposeCharacter(currentChar, convertedChars); |
| convertedChars += currentLength; |
| usedLength += (currentLength - 1); |
| } else { |
| *(convertedChars++) = currentChar; |
| } |
| } |
| |
| return usedLength; |
| } |
| |
| CF_INLINE void __CFUniCharMoveBufferFromEnd1(UTF32Char *convertedChars, CFIndex length, CFIndex delta) { |
| const UTF32Char *limit = convertedChars; |
| UTF32Char *dstP; |
| |
| convertedChars += length; |
| dstP = convertedChars + delta; |
| |
| while (convertedChars > limit) *(--dstP) = *(--convertedChars); |
| } |
| |
| CF_PRIVATE CFIndex CFUniCharCompatibilityDecompose(UTF32Char *convertedChars, CFIndex length, CFIndex maxBufferLength) { |
| UTF32Char currentChar; |
| UTF32Char buffer[MAX_COMP_DECOMP_LEN]; |
| const UTF32Char *bufferP; |
| const UTF32Char *limit = convertedChars + length; |
| CFIndex filledLength; |
| |
| if (NULL == __CFUniCharCompatibilityDecompositionTable) __CFUniCharLoadCompatibilityDecompositionTable(); |
| |
| while (convertedChars < limit) { |
| currentChar = *convertedChars; |
| |
| if (CFUniCharIsMemberOf(currentChar, kCFUniCharCompatibilityDecomposableCharacterSet)) { |
| filledLength = __CFUniCharRecursivelyCompatibilityDecomposeCharacter(currentChar, buffer); |
| |
| if (filledLength + length - 1 > maxBufferLength) return 0; |
| |
| if (filledLength > 1) __CFUniCharMoveBufferFromEnd1(convertedChars + 1, limit - convertedChars - 1, filledLength - 1); |
| |
| bufferP = buffer; |
| length += (filledLength - 1); |
| while (filledLength-- > 0) *(convertedChars++) = *(bufferP++); |
| } else { |
| ++convertedChars; |
| } |
| } |
| |
| return length; |
| } |
| |
| CF_EXPORT void CFUniCharPrioritySort(UTF32Char *characters, CFIndex length) { |
| __CFUniCharPrioritySort(characters, length); |
| } |
| |
| #undef MAX_BUFFER_LENGTH |
| #undef MAX_COMP_DECOMP_LEN |
| #undef HANGUL_SBASE |
| #undef HANGUL_LBASE |
| #undef HANGUL_VBASE |
| #undef HANGUL_TBASE |
| #undef HANGUL_SCOUNT |
| #undef HANGUL_LCOUNT |
| #undef HANGUL_VCOUNT |
| #undef HANGUL_TCOUNT |
| #undef HANGUL_NCOUNT |
| |