/*
**********************************************************************
* Copyright (C) 2007, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* file name: trieset.cpp
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation: 4
*
* created on: 2007jan15
* created by: Markus Scherer
*
* Idea for a "compiled", fast, read-only (immutable) version of a UnicodeSet
* using a UTrie with 8-bit (byte) results per code point.
* Modifies the trie index to make the BMP linear, and uses the original set
* for supplementary code points.
*/
| |
| #include "unicode/utypes.h" |
| #include "unicont.h" |
| |
/*
 * Views the trie's data32 array as bytes and yields a const uint8_t pointer
 * to the byte at offset UTRIE_DATA_BLOCK_LENGTH.
 * TrieSet stores this pointer and indexes it directly with code points 0..0xff.
 */
#define UTRIE_GET8_LATIN1(trie) \
    (&((const uint8_t *)(trie)->data32)[UTRIE_DATA_BLOCK_LENGTH])
| |
/*
 * Byte lookup for a 16-bit code unit c16:
 * the index entry for c16>>UTRIE_SHIFT, shifted left by UTRIE_INDEX_SHIFT,
 * gives the start of a data block; c16&UTRIE_MASK is the offset inside it.
 * The data32 array is read as bytes. c16 is evaluated twice.
 */
#define UTRIE_GET8_FROM_LEAD(trie, c16) \
    (*((const uint8_t *)(trie)->data32+ \
        (((int32_t)((trie)->index[(c16)>>UTRIE_SHIFT])<<UTRIE_INDEX_SHIFT)+ \
         ((c16)&UTRIE_MASK))))
| |
| class TrieSet : public UObject, public UnicodeContainable { |
| public: |
| TrieSet(const UnicodeSet &set, UErrorCode &errorCode) |
| : trieData(NULL), latin1(NULL), restSet(set.clone()) { |
| if(U_FAILURE(errorCode)) { |
| return; |
| } |
| if(restSet==NULL) { |
| errorCode=U_MEMORY_ALLOCATION_ERROR; |
| return; |
| } |
| |
| UNewTrie *newTrie=utrie_open(NULL, NULL, 0x11000, 0, 0, TRUE); |
| UChar32 start, end; |
| |
| UnicodeSetIterator iter(set); |
| |
| while(iter.nextRange() && !iter.isString()) { |
| start=iter.getCodepoint(); |
| end=iter.getCodepointEnd(); |
| if(start>0xffff) { |
| break; |
| } |
| if(end>0xffff) { |
| end=0xffff; |
| } |
| if(!utrie_setRange32(newTrie, start, end+1, TRUE, TRUE)) { |
| errorCode=U_INTERNAL_PROGRAM_ERROR; |
| return; |
| } |
| } |
| |
| // Preflight the trie length. |
| int32_t length=utrie_serialize(newTrie, NULL, 0, NULL, 8, &errorCode); |
| if(errorCode!=U_BUFFER_OVERFLOW_ERROR) { |
| return; |
| } |
| |
| trieData=(uint32_t *)uprv_malloc(length); |
| if(trieData==NULL) { |
| errorCode=U_MEMORY_ALLOCATION_ERROR; |
| return; |
| } |
| |
| errorCode=U_ZERO_ERROR; |
| utrie_serialize(newTrie, trieData, length, NULL, 8, &errorCode); |
| utrie_unserialize(&trie, trieData, length, &errorCode); // TODO: Implement for 8-bit UTrie! |
| |
| if(U_SUCCESS(errorCode)) { |
| // Copy the indexes for surrogate code points into the BMP range |
| // for simple access across the entire BMP. |
| uprv_memcpy((uint16_t *)trie.index+(0xd800>>UTRIE_SHIFT), |
| trie.index+UTRIE_BMP_INDEX_LENGTH, |
| (0x800>>UTRIE_SHIFT)*2); |
| latin1=UTRIE_GET8_LATIN1(&trie); |
| } |
| |
| restSet.remove(0, 0xffff); |
| } |
| |
| ~TrieSet() { |
| uprv_free(trieData); |
| delete restSet; |
| } |
| |
| UBool contains(UChar32 c) const { |
| if((uint32_t)c<=0xff) { |
| return (UBool)latin1[c]; |
| } else if((uint32_t)c<0xffff) { |
| return (UBool)UTRIE_GET8_FROM_LEAD(&trie, c); |
| } else { |
| return restSet->contains(c); |
| } |
| } |
| |
| private: |
| uint32_t *trieData; |
| const uint8_t *latin1; |
| UTrie trie; |
| UnicodeSet *restSet; |
| }; |