# HG changeset patch # User Andre Bargull # Date 1544795326 28800 # Fri Dec 14 05:48:46 2018 -0800 # Node ID 6ac79113d5382e8c21d850686f3f3c89e3ebc81d # Parent 381c646a99e33ee49518341adfa882d7b9817732 Bug 1513934 - Import ICU patch to fix possible time zone misdetection on Windows 7. r=Waldo diff --git a/intl/icu-patches/bug-1513934-timezone-detection-win7-part1.diff b/intl/icu-patches/bug-1513934-timezone-detection-win7-part1.diff new file mode 100644 --- /dev/null +++ b/intl/icu-patches/bug-1513934-timezone-detection-win7-part1.diff @@ -0,0 +1,42 @@ +From 9a2c52d1744abaa57defc5f2fb25927ae16a3a0e Mon Sep 17 00:00:00 2001 +From: Jeff Genovy <29107334+jefgen@users.noreply.github.com> +Date: Wed, 12 Dec 2018 19:42:48 -0800 +Subject: [PATCH] ICU-20302 Timezone detection fails on Windows 7. Also add a + test case for Windows time zone detection failing. + +--- + icu4c/source/common/wintz.cpp | 6 +++--- + icu4c/source/test/cintltst/putiltst.c | 11 +++++++++++ + 2 files changed, 14 insertions(+), 3 deletions(-) + +diff --git a/intl/icu/source/common/wintz.cpp b/intl/icu/source/common/wintz.cpp +index 5e9ac0d2f37..8a143d9e782 100644 +--- a/intl/icu/source/common/wintz.cpp ++++ b/intl/icu/source/common/wintz.cpp +@@ -35,7 +35,7 @@ + + U_NAMESPACE_BEGIN + +-// The value of MAX_TIMEZONE_ID_LENGTH is 128, which is defined in DYNAMIC_TIME_ZONE_INFORMATION ++// The max size of TimeZoneKeyName is 128, defined in DYNAMIC_TIME_ZONE_INFORMATION + #define MAX_TIMEZONE_ID_LENGTH 128 + + /** +@@ -44,7 +44,7 @@ U_NAMESPACE_BEGIN + * Note: We use the Win32 API GetDynamicTimeZoneInformation to get the current time zone info. + * This API returns a non-localized time zone name, which we can then map to an ICU time zone name. + */ +-U_CFUNC const char* U_EXPORT2 ++U_INTERNAL const char* U_EXPORT2 + uprv_detectWindowsTimeZone() + { + UErrorCode status = U_ZERO_ERROR; +@@ -79,7 +79,7 @@ uprv_detectWindowsTimeZone() + + // convert from wchar_t* (UTF-16 on Windows) to char* (UTF-8). + u_strToUTF8(dynamicTZKeyName, UPRV_LENGTHOF(dynamicTZKeyName), nullptr, +- reinterpret_cast(dynamicTZI.TimeZoneKeyName), UPRV_LENGTHOF(dynamicTZI.TimeZoneKeyName), &status); ++ reinterpret_cast(dynamicTZI.TimeZoneKeyName), -1, &status); + + if (U_FAILURE(status)) { + return nullptr; diff --git a/intl/icu-patches/bug-1513934-timezone-detection-win7-part2.diff b/intl/icu-patches/bug-1513934-timezone-detection-win7-part2.diff new file mode 100644 --- /dev/null +++ b/intl/icu-patches/bug-1513934-timezone-detection-win7-part2.diff @@ -0,0 +1,22 @@ +From 3c644c62c71c890424ef5d20caa2f9dc354e02d6 Mon Sep 17 00:00:00 2001 +From: Jeff Genovy +Date: Fri, 14 Dec 2018 00:56:51 -0800 +Subject: [PATCH] ICU-20302 Fix wintz header file. (Thanks to Jungshik). + +--- + icu4c/source/common/wintz.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/intl/icu/source/common/wintz.h b/intl/icu/source/common/wintz.h +index f98b1779b5d..cd8565eef1e 100644 +--- a/intl/icu/source/common/wintz.h ++++ b/intl/icu/source/common/wintz.h +@@ -28,7 +28,7 @@ U_CDECL_BEGIN + typedef struct _TIME_ZONE_INFORMATION TIME_ZONE_INFORMATION; + U_CDECL_END + +-U_CFUNC const char* U_EXPORT2 ++U_INTERNAL const char* U_EXPORT2 + uprv_detectWindowsTimeZone(); + + #endif /* U_PLATFORM_USES_ONLY_WIN32_API */ diff --git a/intl/icu/GIT-INFO b/intl/icu/GIT-INFO --- a/intl/icu/GIT-INFO +++ b/intl/icu/GIT-INFO @@ -1,7 +1,14 @@ -commit 6cbd62e59e30f73b444be89ea71fd74275ac53a4 -Author: Shane Carr -Date: Mon Oct 29 23:52:44 2018 -0700 +commit f3fa0d604ef6527a01dab96f4bfa3c5290127337 +Author: Markus Scherer +Date: Fri Nov 9 12:54:22 2018 -0800 - ICU-20246 Fixing another integer overflow in number parsing. + ICU-20250 make UnicodeSet(intprop=value) faster + - fastpath for UnicodeSet.add(new last range) + - fewer UnicodeSet memory allocations: + initial internal list array, exponential array growth, + allocate strings list/set only when first one is added + - faster CodePointTrie.getRange(): fewer calls to filter function + - revert UnicodeSet(intprop=value) from trie ranges to range starts + lookup + - cache per-int-prop range starts: fewer lookups - (cherry picked from commit 53d8c8f3d181d87a6aa925b449b51c4a2c922a51) + (cherry picked from commit 98f9170004c29388d756a8a283573164a7a26bef) diff --git a/intl/icu/source/common/characterproperties.cpp b/intl/icu/source/common/characterproperties.cpp --- a/intl/icu/source/common/characterproperties.cpp +++ b/intl/icu/source/common/characterproperties.cpp @@ -18,28 +18,33 @@ #include "normalizer2impl.h" #include "uassert.h" #include "ubidi_props.h" #include "ucase.h" #include "ucln_cmn.h" #include "umutex.h" #include "uprops.h" +using icu::LocalPointer; +using icu::Normalizer2Factory; +using icu::Normalizer2Impl; using icu::UInitOnce; using icu::UnicodeSet; namespace { UBool U_CALLCONV characterproperties_cleanup(); +constexpr int32_t NUM_INCLUSIONS = UPROPS_SRC_COUNT + UCHAR_INT_LIMIT - UCHAR_INT_START; + struct Inclusion { UnicodeSet *fSet; UInitOnce fInitOnce; }; -Inclusion gInclusions[UPROPS_SRC_COUNT]; // cached getInclusions() +Inclusion gInclusions[NUM_INCLUSIONS]; // cached getInclusions() UnicodeSet *sets[UCHAR_BINARY_LIMIT] = {}; UCPMap *maps[UCHAR_INT_LIMIT - UCHAR_INT_START] = {}; UMutex cpMutex = U_MUTEX_INITIALIZER; //---------------------------------------------------------------- @@ -75,53 +80,39 @@ UBool U_CALLCONV characterproperties_cle } for (int32_t i = 0; i < UPRV_LENGTHOF(maps); ++i) { ucptrie_close(reinterpret_cast(maps[i])); maps[i] = nullptr; } return TRUE; } -} // namespace - -U_NAMESPACE_BEGIN - -/* -Reduce excessive reallocation, and make it easier to detect initialization problems. -Usually you don't see smaller sets than this for Unicode 5.0. -*/ -constexpr int32_t DEFAULT_INCLUSION_CAPACITY = 3072; - -void U_CALLCONV CharacterProperties::initInclusion(UPropertySource src, UErrorCode &errorCode) { +void U_CALLCONV initInclusion(UPropertySource src, UErrorCode &errorCode) { // This function is invoked only via umtx_initOnce(). - // This function is a friend of class UnicodeSet. - U_ASSERT(0 <= src && src < UPROPS_SRC_COUNT); if (src == UPROPS_SRC_NONE) { errorCode = U_INTERNAL_PROGRAM_ERROR; return; } - UnicodeSet * &incl = gInclusions[src].fSet; - U_ASSERT(incl == nullptr); + U_ASSERT(gInclusions[src].fSet == nullptr); - incl = new UnicodeSet(); - if (incl == nullptr) { + LocalPointer incl(new UnicodeSet()); + if (incl.isNull()) { errorCode = U_MEMORY_ALLOCATION_ERROR; return; } USetAdder sa = { - (USet *)incl, + (USet *)incl.getAlias(), _set_add, _set_addRange, _set_addString, nullptr, // don't need remove() nullptr // don't need removeRange() }; - incl->ensureCapacity(DEFAULT_INCLUSION_CAPACITY, errorCode); switch(src) { case UPROPS_SRC_CHAR: uchar_addPropertyStarts(&sa, &errorCode); break; case UPROPS_SRC_PROPSVEC: upropsvec_addPropertyStarts(&sa, &errorCode); break; case UPROPS_SRC_CHAR_AND_PROPSVEC: @@ -178,50 +169,104 @@ void U_CALLCONV CharacterProperties::ini uprops_addPropertyStarts((UPropertySource)src, &sa, &errorCode); break; default: errorCode = U_INTERNAL_PROGRAM_ERROR; break; } if (U_FAILURE(errorCode)) { - delete incl; - incl = nullptr; return; } - // Compact for caching + if (incl->isBogus()) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return; + } + // Compact for caching. incl->compact(); + gInclusions[src].fSet = incl.orphan(); ucln_common_registerCleanup(UCLN_COMMON_CHARACTERPROPERTIES, characterproperties_cleanup); } const UnicodeSet *getInclusionsForSource(UPropertySource src, UErrorCode &errorCode) { if (U_FAILURE(errorCode)) { return nullptr; } if (src < 0 || UPROPS_SRC_COUNT <= src) { errorCode = U_ILLEGAL_ARGUMENT_ERROR; return nullptr; } Inclusion &i = gInclusions[src]; - umtx_initOnce(i.fInitOnce, &CharacterProperties::initInclusion, src, errorCode); + umtx_initOnce(i.fInitOnce, &initInclusion, src, errorCode); return i.fSet; } +void U_CALLCONV initIntPropInclusion(UProperty prop, UErrorCode &errorCode) { + // This function is invoked only via umtx_initOnce(). + U_ASSERT(UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT); + int32_t inclIndex = UPROPS_SRC_COUNT + prop - UCHAR_INT_START; + U_ASSERT(gInclusions[inclIndex].fSet == nullptr); + UPropertySource src = uprops_getSource(prop); + const UnicodeSet *incl = getInclusionsForSource(src, errorCode); + if (U_FAILURE(errorCode)) { + return; + } + + LocalPointer intPropIncl(new UnicodeSet(0, 0)); + if (intPropIncl.isNull()) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return; + } + int32_t numRanges = incl->getRangeCount(); + int32_t prevValue = 0; + for (int32_t i = 0; i < numRanges; ++i) { + UChar32 rangeEnd = incl->getRangeEnd(i); + for (UChar32 c = incl->getRangeStart(i); c <= rangeEnd; ++c) { + // TODO: Get a UCharacterProperty.IntProperty to avoid the property dispatch. + int32_t value = u_getIntPropertyValue(c, prop); + if (value != prevValue) { + intPropIncl->add(c); + prevValue = value; + } + } + } + + if (intPropIncl->isBogus()) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return; + } + // Compact for caching. + intPropIncl->compact(); + gInclusions[inclIndex].fSet = intPropIncl.orphan(); + ucln_common_registerCleanup(UCLN_COMMON_CHARACTERPROPERTIES, characterproperties_cleanup); +} + +} // namespace + +U_NAMESPACE_BEGIN + const UnicodeSet *CharacterProperties::getInclusionsForProperty( UProperty prop, UErrorCode &errorCode) { if (U_FAILURE(errorCode)) { return nullptr; } - UPropertySource src = uprops_getSource(prop); - return getInclusionsForSource(src, errorCode); + if (UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT) { + int32_t inclIndex = UPROPS_SRC_COUNT + prop - UCHAR_INT_START; + Inclusion &i = gInclusions[inclIndex]; + umtx_initOnce(i.fInitOnce, &initIntPropInclusion, prop, errorCode); + return i.fSet; + } else { + UPropertySource src = uprops_getSource(prop); + return getInclusionsForSource(src, errorCode); + } } U_NAMESPACE_END namespace { UnicodeSet *makeSet(UProperty property, UErrorCode &errorCode) { if (U_FAILURE(errorCode)) { return nullptr; } - icu::LocalPointer set(new UnicodeSet()); + LocalPointer set(new UnicodeSet()); if (set.isNull()) { errorCode = U_MEMORY_ALLOCATION_ERROR; return nullptr; } const UnicodeSet *inclusions = icu::CharacterProperties::getInclusionsForProperty(property, errorCode); if (U_FAILURE(errorCode)) { return nullptr; } int32_t numRanges = inclusions->getRangeCount(); diff --git a/intl/icu/source/common/ucptrie.cpp b/intl/icu/source/common/ucptrie.cpp --- a/intl/icu/source/common/ucptrie.cpp +++ b/intl/icu/source/common/ucptrie.cpp @@ -275,17 +275,17 @@ UChar32 getRange(const void *t, UChar32 uint32_t nullValue = trie->nullValue; if (filter != nullptr) { nullValue = filter(context, nullValue); } const uint16_t *index = trie->index; int32_t prevI3Block = -1; int32_t prevBlock = -1; UChar32 c = start; - uint32_t value; + uint32_t trieValue, value; bool haveValue = false; do { int32_t i3Block; int32_t i3; int32_t i3BlockLength; int32_t dataBlockLength; if (c <= 0xffff && (trie->type == UCPTRIE_TYPE_FAST || c <= UCPTRIE_SMALL_MAX)) { i3Block = 0; @@ -314,16 +314,17 @@ UChar32 getRange(const void *t, UChar32 prevI3Block = i3Block; if (i3Block == trie->index3NullOffset) { // This is the index-3 null block. if (haveValue) { if (nullValue != value) { return c - 1; } } else { + trieValue = trie->nullValue; value = nullValue; if (pValue != nullptr) { *pValue = nullValue; } haveValue = true; } prevBlock = trie->dataNullOffset; c = (c + UCPTRIE_CP_PER_INDEX_2_ENTRY) & ~(UCPTRIE_CP_PER_INDEX_2_ENTRY - 1); continue; } @@ -352,40 +353,50 @@ UChar32 getRange(const void *t, UChar32 prevBlock = block; if (block == trie->dataNullOffset) { // This is the data null block. if (haveValue) { if (nullValue != value) { return c - 1; } } else { + trieValue = trie->nullValue; value = nullValue; if (pValue != nullptr) { *pValue = nullValue; } haveValue = true; } c = (c + dataBlockLength) & ~dataMask; } else { int32_t di = block + (c & dataMask); - uint32_t value2 = getValue(trie->data, valueWidth, di); - value2 = maybeFilterValue(value2, trie->nullValue, nullValue, - filter, context); + uint32_t trieValue2 = getValue(trie->data, valueWidth, di); if (haveValue) { - if (value2 != value) { - return c - 1; + if (trieValue2 != trieValue) { + if (filter == nullptr || + maybeFilterValue(trieValue2, trie->nullValue, nullValue, + filter, context) != value) { + return c - 1; + } + trieValue = trieValue2; // may or may not help } } else { - value = value2; + trieValue = trieValue2; + value = maybeFilterValue(trieValue2, trie->nullValue, nullValue, + filter, context); if (pValue != nullptr) { *pValue = value; } haveValue = true; } while ((++c & dataMask) != 0) { - if (maybeFilterValue(getValue(trie->data, valueWidth, ++di), - trie->nullValue, nullValue, - filter, context) != value) { - return c - 1; + trieValue2 = getValue(trie->data, valueWidth, ++di); + if (trieValue2 != trieValue) { + if (filter == nullptr || + maybeFilterValue(trieValue2, trie->nullValue, nullValue, + filter, context) != value) { + return c - 1; + } + trieValue = trieValue2; // may or may not help } } } } } while (++i3 < i3BlockLength); } while (c < trie->highStart); U_ASSERT(haveValue); int32_t di = trie->dataLength - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET; diff --git a/intl/icu/source/common/umutablecptrie.cpp b/intl/icu/source/common/umutablecptrie.cpp --- a/intl/icu/source/common/umutablecptrie.cpp +++ b/intl/icu/source/common/umutablecptrie.cpp @@ -55,16 +55,17 @@ constexpr int32_t MAX_DATA_LENGTH = UNIC constexpr uint8_t I3_NULL = 0; constexpr uint8_t I3_BMP = 1; constexpr uint8_t I3_16 = 2; constexpr uint8_t I3_18 = 3; constexpr int32_t INDEX_3_18BIT_BLOCK_LENGTH = UCPTRIE_INDEX_3_BLOCK_LENGTH + UCPTRIE_INDEX_3_BLOCK_LENGTH / 8; class AllSameBlocks; +class MixedBlocks; class MutableCodePointTrie : public UMemory { public: MutableCodePointTrie(uint32_t initialValue, uint32_t errorValue, UErrorCode &errorCode); MutableCodePointTrie(const MutableCodePointTrie &other, UErrorCode &errorCode); MutableCodePointTrie(const MutableCodePointTrie &other) = delete; ~MutableCodePointTrie(); @@ -87,18 +88,20 @@ private: bool ensureHighStart(UChar32 c); int32_t allocDataBlock(int32_t blockLength); int32_t getDataBlock(int32_t i); void maskValues(uint32_t mask); UChar32 findHighStart() const; int32_t compactWholeDataBlocks(int32_t fastILimit, AllSameBlocks &allSameBlocks); - int32_t compactData(int32_t fastILimit, uint32_t *newData, int32_t dataNullIndex); - int32_t compactIndex(int32_t fastILimit, UErrorCode &errorCode); + int32_t compactData( + int32_t fastILimit, uint32_t *newData, int32_t newDataCapacity, + int32_t dataNullIndex, MixedBlocks &mixedBlocks, UErrorCode &errorCode); + int32_t compactIndex(int32_t fastILimit, MixedBlocks &mixedBlocks, UErrorCode &errorCode); int32_t compactTrie(int32_t fastILimit, UErrorCode &errorCode); uint32_t *index = nullptr; int32_t indexCapacity = 0; int32_t index3NullOffset = -1; uint32_t *data = nullptr; int32_t dataCapacity = 0; int32_t dataLength = 0; @@ -296,51 +299,66 @@ UChar32 MutableCodePointTrie::getRange( if (filter != nullptr) { value = filter(context, value); } *pValue = value; } return MAX_UNICODE; } uint32_t nullValue = initialValue; if (filter != nullptr) { nullValue = filter(context, nullValue); } UChar32 c = start; - uint32_t value; + uint32_t trieValue, value; bool haveValue = false; int32_t i = c >> UCPTRIE_SHIFT_3; do { if (flags[i] == ALL_SAME) { - uint32_t value2 = maybeFilterValue(index[i], initialValue, nullValue, - filter, context); + uint32_t trieValue2 = index[i]; if (haveValue) { - if (value2 != value) { - return c - 1; + if (trieValue2 != trieValue) { + if (filter == nullptr || + maybeFilterValue(trieValue2, initialValue, nullValue, + filter, context) != value) { + return c - 1; + } + trieValue = trieValue2; // may or may not help } } else { - value = value2; + trieValue = trieValue2; + value = maybeFilterValue(trieValue2, initialValue, nullValue, filter, context); if (pValue != nullptr) { *pValue = value; } haveValue = true; } c = (c + UCPTRIE_SMALL_DATA_BLOCK_LENGTH) & ~UCPTRIE_SMALL_DATA_MASK; } else /* MIXED */ { int32_t di = index[i] + (c & UCPTRIE_SMALL_DATA_MASK); - uint32_t value2 = maybeFilterValue(data[di], initialValue, nullValue, - filter, context); + uint32_t trieValue2 = data[di]; if (haveValue) { - if (value2 != value) { - return c - 1; + if (trieValue2 != trieValue) { + if (filter == nullptr || + maybeFilterValue(trieValue2, initialValue, nullValue, + filter, context) != value) { + return c - 1; + } + trieValue = trieValue2; // may or may not help } } else { - value = value2; + trieValue = trieValue2; + value = maybeFilterValue(trieValue2, initialValue, nullValue, filter, context); if (pValue != nullptr) { *pValue = value; } haveValue = true; } while ((++c & UCPTRIE_SMALL_DATA_MASK) != 0) { - if (maybeFilterValue(data[++di], initialValue, nullValue, - filter, context) != value) { - return c - 1; + trieValue2 = data[++di]; + if (trieValue2 != trieValue) { + if (filter == nullptr || + maybeFilterValue(trieValue2, initialValue, nullValue, + filter, context) != value) { + return c - 1; + } } + trieValue = trieValue2; // may or may not help } } ++i; } while (c < highStart); U_ASSERT(haveValue); if (maybeFilterValue(highValue, initialValue, nullValue, filter, context) != value) { return c - 1; @@ -543,83 +561,33 @@ void MutableCodePointTrie::maskValues(ui index[i] &= mask; } } for (int32_t i = 0; i < dataLength; ++i) { data[i] &= mask; } } -inline bool -equalBlocks(const uint32_t *s, const uint32_t *t, int32_t length) { - while (length > 0 && *s == *t) { - ++s; - ++t; - --length; - } - return length == 0; -} - -inline bool -equalBlocks(const uint16_t *s, const uint32_t *t, int32_t length) { - while (length > 0 && *s == *t) { - ++s; - ++t; - --length; - } - return length == 0; -} - -inline bool -equalBlocks(const uint16_t *s, const uint16_t *t, int32_t length) { +template +bool equalBlocks(const UIntA *s, const UIntB *t, int32_t length) { while (length > 0 && *s == *t) { ++s; ++t; --length; } return length == 0; } bool allValuesSameAs(const uint32_t *p, int32_t length, uint32_t value) { const uint32_t *pLimit = p + length; while (p < pLimit && *p == value) { ++p; } return p == pLimit; } /** Search for an identical block. */ -int32_t findSameBlock(const uint32_t *p, int32_t pStart, int32_t length, - const uint32_t *q, int32_t qStart, int32_t blockLength) { - // Ensure that we do not even partially get past length. - length -= blockLength; - - q += qStart; - while (pStart <= length) { - if (equalBlocks(p + pStart, q, blockLength)) { - return pStart; - } - ++pStart; - } - return -1; -} - -int32_t findSameBlock(const uint16_t *p, int32_t pStart, int32_t length, - const uint32_t *q, int32_t qStart, int32_t blockLength) { - // Ensure that we do not even partially get past length. - length -= blockLength; - - q += qStart; - while (pStart <= length) { - if (equalBlocks(p + pStart, q, blockLength)) { - return pStart; - } - ++pStart; - } - return -1; -} - int32_t findSameBlock(const uint16_t *p, int32_t pStart, int32_t length, const uint16_t *q, int32_t qStart, int32_t blockLength) { // Ensure that we do not even partially get past length. length -= blockLength; q += qStart; while (pStart <= length) { if (equalBlocks(p + pStart, q, blockLength)) { @@ -650,40 +618,19 @@ int32_t findAllSameBlock(const uint32_t } return -1; } /** * Look for maximum overlap of the beginning of the other block * with the previous, adjacent block. */ -int32_t getOverlap(const uint32_t *p, int32_t length, - const uint32_t *q, int32_t qStart, int32_t blockLength) { - int32_t overlap = blockLength - 1; - U_ASSERT(overlap <= length); - q += qStart; - while (overlap > 0 && !equalBlocks(p + (length - overlap), q, overlap)) { - --overlap; - } - return overlap; -} - -int32_t getOverlap(const uint16_t *p, int32_t length, - const uint32_t *q, int32_t qStart, int32_t blockLength) { - int32_t overlap = blockLength - 1; - U_ASSERT(overlap <= length); - q += qStart; - while (overlap > 0 && !equalBlocks(p + (length - overlap), q, overlap)) { - --overlap; - } - return overlap; -} - -int32_t getOverlap(const uint16_t *p, int32_t length, - const uint16_t *q, int32_t qStart, int32_t blockLength) { +template +int32_t getOverlap(const UIntA *p, int32_t length, + const UIntB *q, int32_t qStart, int32_t blockLength) { int32_t overlap = blockLength - 1; U_ASSERT(overlap <= length); q += qStart; while (overlap > 0 && !equalBlocks(p + (length - overlap), q, overlap)) { --overlap; } return overlap; } @@ -802,16 +749,181 @@ private: int32_t length; int32_t mostRecent; int32_t indexes[CAPACITY]; uint32_t values[CAPACITY]; int32_t refCounts[CAPACITY]; }; +// Custom hash table for mixed-value blocks to be found anywhere in the +// compacted data or index so far. +class MixedBlocks { +public: + MixedBlocks() {} + ~MixedBlocks() { + uprv_free(table); + } + + bool init(int32_t maxLength, int32_t newBlockLength) { + // We store actual data indexes + 1 to reserve 0 for empty entries. + int32_t maxDataIndex = maxLength - newBlockLength + 1; + int32_t newLength; + if (maxDataIndex <= 0xfff) { // 4k + newLength = 6007; + shift = 12; + mask = 0xfff; + } else if (maxDataIndex <= 0x7fff) { // 32k + newLength = 50021; + shift = 15; + mask = 0x7fff; + } else if (maxDataIndex <= 0x1ffff) { // 128k + newLength = 200003; + shift = 17; + mask = 0x1ffff; + } else { + // maxDataIndex up to around MAX_DATA_LENGTH, ca. 1.1M + newLength = 1500007; + shift = 21; + mask = 0x1fffff; + } + if (newLength > capacity) { + uprv_free(table); + table = (uint32_t *)uprv_malloc(newLength * 4); + if (table == nullptr) { + return false; + } + capacity = newLength; + } + length = newLength; + uprv_memset(table, 0, length * 4); + + blockLength = newBlockLength; + return true; + } + + template + void extend(const UInt *data, int32_t minStart, int32_t prevDataLength, int32_t newDataLength) { + int32_t start = prevDataLength - blockLength; + if (start >= minStart) { + ++start; // Skip the last block that we added last time. + } else { + start = minStart; // Begin with the first full block. + } + for (int32_t end = newDataLength - blockLength; start <= end; ++start) { + uint32_t hashCode = makeHashCode(data, start); + addEntry(data, start, hashCode, start); + } + } + + template + int32_t findBlock(const UIntA *data, const UIntB *blockData, int32_t blockStart) const { + uint32_t hashCode = makeHashCode(blockData, blockStart); + int32_t entryIndex = findEntry(data, blockData, blockStart, hashCode); + if (entryIndex >= 0) { + return (table[entryIndex] & mask) - 1; + } else { + return -1; + } + } + + int32_t findAllSameBlock(const uint32_t *data, uint32_t blockValue) const { + uint32_t hashCode = makeHashCode(blockValue); + int32_t entryIndex = findEntry(data, blockValue, hashCode); + if (entryIndex >= 0) { + return (table[entryIndex] & mask) - 1; + } else { + return -1; + } + } + +private: + template + uint32_t makeHashCode(const UInt *blockData, int32_t blockStart) const { + int32_t blockLimit = blockStart + blockLength; + uint32_t hashCode = blockData[blockStart++]; + do { + hashCode = 37 * hashCode + blockData[blockStart++]; + } while (blockStart < blockLimit); + return hashCode; + } + + uint32_t makeHashCode(uint32_t blockValue) const { + uint32_t hashCode = blockValue; + for (int32_t i = 1; i < blockLength; ++i) { + hashCode = 37 * hashCode + blockValue; + } + return hashCode; + } + + template + void addEntry(const UInt *data, int32_t blockStart, uint32_t hashCode, int32_t dataIndex) { + U_ASSERT(0 <= dataIndex && dataIndex < (int32_t)mask); + int32_t entryIndex = findEntry(data, data, blockStart, hashCode); + if (entryIndex < 0) { + table[~entryIndex] = (hashCode << shift) | (dataIndex + 1); + } + } + + template + int32_t findEntry(const UIntA *data, const UIntB *blockData, int32_t blockStart, + uint32_t hashCode) const { + uint32_t shiftedHashCode = hashCode << shift; + int32_t initialEntryIndex = (hashCode % (length - 1)) + 1; // 1..length-1 + for (int32_t entryIndex = initialEntryIndex;;) { + uint32_t entry = table[entryIndex]; + if (entry == 0) { + return ~entryIndex; + } + if ((entry & ~mask) == shiftedHashCode) { + int32_t dataIndex = (entry & mask) - 1; + if (equalBlocks(data + dataIndex, blockData + blockStart, blockLength)) { + return entryIndex; + } + } + entryIndex = nextIndex(initialEntryIndex, entryIndex); + } + } + + int32_t findEntry(const uint32_t *data, uint32_t blockValue, uint32_t hashCode) const { + uint32_t shiftedHashCode = hashCode << shift; + int32_t initialEntryIndex = (hashCode % (length - 1)) + 1; // 1..length-1 + for (int32_t entryIndex = initialEntryIndex;;) { + uint32_t entry = table[entryIndex]; + if (entry == 0) { + return ~entryIndex; + } + if ((entry & ~mask) == shiftedHashCode) { + int32_t dataIndex = (entry & mask) - 1; + if (allValuesSameAs(data + dataIndex, blockLength, blockValue)) { + return entryIndex; + } + } + entryIndex = nextIndex(initialEntryIndex, entryIndex); + } + } + + inline int32_t nextIndex(int32_t initialEntryIndex, int32_t entryIndex) const { + // U_ASSERT(0 < initialEntryIndex && initialEntryIndex < length); + return (entryIndex + initialEntryIndex) % length; + } + + // Hash table. + // The length is a prime number, larger than the maximum data length. + // The "shift" lower bits store a data index + 1. + // The remaining upper bits store a partial hashCode of the block data values. + uint32_t *table = nullptr; + int32_t capacity = 0; + int32_t length = 0; + int32_t shift = 0; + uint32_t mask = 0; + + int32_t blockLength = 0; +}; + int32_t MutableCodePointTrie::compactWholeDataBlocks(int32_t fastILimit, AllSameBlocks &allSameBlocks) { #ifdef UCPTRIE_DEBUG bool overflow = false; #endif // ASCII data will be stored as a linear table, even if the following code // does not yet count it that way. int32_t newDataCapacity = ASCII_LIMIT; @@ -957,18 +1069,19 @@ void printBlock(const uint32_t *block, i * The compaction * - removes blocks that are identical with earlier ones * - overlaps each new non-duplicate block as much as possible with the previously-written one * - works with fast-range data blocks whose length is a multiple of that of * higher-code-point data blocks * * It does not try to find an optimal order of writing, deduplicating, and overlapping blocks. */ -int32_t MutableCodePointTrie::compactData(int32_t fastILimit, - uint32_t *newData, int32_t dataNullIndex) { +int32_t MutableCodePointTrie::compactData( + int32_t fastILimit, uint32_t *newData, int32_t newDataCapacity, + int32_t dataNullIndex, MixedBlocks &mixedBlocks, UErrorCode &errorCode) { #ifdef UCPTRIE_DEBUG int32_t countSame=0, sumOverlaps=0; bool printData = dataLength == 29088 /* line.brk */ || // dataLength == 30048 /* CanonIterData */ || dataLength == 50400 /* zh.txt~stroke */; #endif // The linear ASCII data has been copied into newData already. @@ -978,95 +1091,109 @@ int32_t MutableCodePointTrie::compactDat index[i] = newDataLength; #ifdef UCPTRIE_DEBUG if (printData) { printBlock(newData + newDataLength, UCPTRIE_FAST_DATA_BLOCK_LENGTH, 0, newDataLength, 0, initialValue); } #endif } + int32_t blockLength = UCPTRIE_FAST_DATA_BLOCK_LENGTH; + if (!mixedBlocks.init(newDataCapacity, blockLength)) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return 0; + } + mixedBlocks.extend(newData, 0, 0, newDataLength); + int32_t iLimit = highStart >> UCPTRIE_SHIFT_3; - int32_t blockLength = UCPTRIE_FAST_DATA_BLOCK_LENGTH; int32_t inc = SMALL_DATA_BLOCKS_PER_BMP_BLOCK; int32_t fastLength = 0; for (int32_t i = ASCII_I_LIMIT; i < iLimit; i += inc) { if (i == fastILimit) { blockLength = UCPTRIE_SMALL_DATA_BLOCK_LENGTH; inc = 1; fastLength = newDataLength; + if (!mixedBlocks.init(newDataCapacity, blockLength)) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return 0; + } + mixedBlocks.extend(newData, 0, 0, newDataLength); } if (flags[i] == ALL_SAME) { uint32_t value = index[i]; - int32_t n; // Find an earlier part of the data array of length blockLength // that is filled with this value. + int32_t n = mixedBlocks.findAllSameBlock(newData, value); // If we find a match, and the current block is the data null block, // and it is not a fast block but matches the start of a fast block, // then we need to continue looking. // This is because this small block is shorter than the fast block, // and not all of the rest of the fast block is filled with this value. // Otherwise trie.getRange() would detect that the fast block starts at // dataNullOffset and assume incorrectly that it is filled with the null value. - for (int32_t start = 0; - (n = findAllSameBlock(newData, start, newDataLength, - value, blockLength)) >= 0 && - i == dataNullIndex && i >= fastILimit && n < fastLength && - isStartOfSomeFastBlock(n, index, fastILimit); - start = n + 1) {} + while (n >= 0 && i == dataNullIndex && i >= fastILimit && n < fastLength && + isStartOfSomeFastBlock(n, index, fastILimit)) { + n = findAllSameBlock(newData, n + 1, newDataLength, value, blockLength); + } if (n >= 0) { DEBUG_DO(++countSame); index[i] = n; } else { n = getAllSameOverlap(newData, newDataLength, value, blockLength); DEBUG_DO(sumOverlaps += n); #ifdef UCPTRIE_DEBUG if (printData) { printBlock(nullptr, blockLength, value, i << UCPTRIE_SHIFT_3, n, initialValue); } #endif index[i] = newDataLength - n; + int32_t prevDataLength = newDataLength; while (n < blockLength) { newData[newDataLength++] = value; ++n; } + mixedBlocks.extend(newData, 0, prevDataLength, newDataLength); } } else if (flags[i] == MIXED) { const uint32_t *block = data + index[i]; - int32_t n = findSameBlock(newData, 0, newDataLength, block, 0, blockLength); + int32_t n = mixedBlocks.findBlock(newData, block, 0); if (n >= 0) { DEBUG_DO(++countSame); index[i] = n; } else { n = getOverlap(newData, newDataLength, block, 0, blockLength); DEBUG_DO(sumOverlaps += n); #ifdef UCPTRIE_DEBUG if (printData) { printBlock(block, blockLength, 0, i << UCPTRIE_SHIFT_3, n, initialValue); } #endif index[i] = newDataLength - n; + int32_t prevDataLength = newDataLength; while (n < blockLength) { newData[newDataLength++] = block[n++]; } + mixedBlocks.extend(newData, 0, prevDataLength, newDataLength); } } else /* SAME_AS */ { uint32_t j = index[i]; index[i] = index[j]; } } #ifdef UCPTRIE_DEBUG /* we saved some space */ printf("compacting UCPTrie: count of 32-bit data words %lu->%lu countSame=%ld sumOverlaps=%ld\n", (long)dataLength, (long)newDataLength, (long)countSame, (long)sumOverlaps); #endif return newDataLength; } -int32_t MutableCodePointTrie::compactIndex(int32_t fastILimit, UErrorCode &errorCode) { +int32_t MutableCodePointTrie::compactIndex(int32_t fastILimit, MixedBlocks &mixedBlocks, + UErrorCode &errorCode) { int32_t fastIndexLength = fastILimit >> (UCPTRIE_FAST_SHIFT - UCPTRIE_SHIFT_3); if ((highStart >> UCPTRIE_FAST_SHIFT) <= fastIndexLength) { // Only the linear fast index, no multi-stage index tables. index3NullOffset = UCPTRIE_NO_INDEX3_NULL_OFFSET; return fastIndexLength; } // Condense the fast index table. @@ -1090,26 +1217,33 @@ int32_t MutableCodePointTrie::compactInd // Needed when the multi-stage index covers the fast index range as well. int32_t iNext = i + SMALL_DATA_BLOCKS_PER_BMP_BLOCK; while (++i < iNext) { i3 += UCPTRIE_SMALL_DATA_BLOCK_LENGTH; index[i] = i3; } } + if (!mixedBlocks.init(fastIndexLength, UCPTRIE_INDEX_3_BLOCK_LENGTH)) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return 0; + } + mixedBlocks.extend(fastIndex, 0, 0, fastIndexLength); + // Examine index-3 blocks. For each determine one of: // - same as the index-3 null block // - same as a fast-index block // - 16-bit indexes // - 18-bit indexes // We store this in the first flags entry for the index-3 block. // // Also determine an upper limit for the index-3 table length. int32_t index3Capacity = 0; i3FirstNull = index3NullOffset; + bool hasLongI3Blocks = false; // If the fast index covers the whole BMP, then // the multi-stage index is only for supplementary code points. // Otherwise, the multi-stage index covers all of Unicode. int32_t iStart = fastILimit < BMP_I_LIMIT ? 0 : BMP_I_LIMIT; int32_t iLimit = highStart >> UCPTRIE_SHIFT_3; for (int32_t i = iStart; i < iLimit;) { int32_t j = i; int32_t jLimit = i + UCPTRIE_INDEX_3_BLOCK_LENGTH; @@ -1124,33 +1258,34 @@ int32_t MutableCodePointTrie::compactInd } while (++j < jLimit); if (isNull) { flags[i] = I3_NULL; if (i3FirstNull < 0) { if (oredI3 <= 0xffff) { index3Capacity += UCPTRIE_INDEX_3_BLOCK_LENGTH; } else { index3Capacity += INDEX_3_18BIT_BLOCK_LENGTH; + hasLongI3Blocks = true; } i3FirstNull = 0; } } else { if (oredI3 <= 0xffff) { - int32_t n = findSameBlock(fastIndex, 0, fastIndexLength, - index, i, UCPTRIE_INDEX_3_BLOCK_LENGTH); + int32_t n = mixedBlocks.findBlock(fastIndex, index, i); if (n >= 0) { flags[i] = I3_BMP; index[i] = n; } else { flags[i] = I3_16; index3Capacity += UCPTRIE_INDEX_3_BLOCK_LENGTH; } } else { flags[i] = I3_18; index3Capacity += INDEX_3_18BIT_BLOCK_LENGTH; + hasLongI3Blocks = true; } } i = j; } int32_t index2Capacity = (iLimit - iStart) >> UCPTRIE_SHIFT_2_3; // Length of the index-1 table, rounded up. @@ -1161,16 +1296,28 @@ int32_t MutableCodePointTrie::compactInd int32_t index16Capacity = fastIndexLength + index1Length + index3Capacity + index2Capacity + 1; index16 = (uint16_t *)uprv_malloc(index16Capacity * 2); if (index16 == nullptr) { errorCode = U_MEMORY_ALLOCATION_ERROR; return 0; } uprv_memcpy(index16, fastIndex, fastIndexLength * 2); + if (!mixedBlocks.init(index16Capacity, UCPTRIE_INDEX_3_BLOCK_LENGTH)) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return 0; + } + MixedBlocks longI3Blocks; + if (hasLongI3Blocks) { + if (!longI3Blocks.init(index16Capacity, INDEX_3_18BIT_BLOCK_LENGTH)) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return 0; + } + } + // Compact the index-3 table and write an uncompacted version of the index-2 table. uint16_t index2[UNICODE_LIMIT >> UCPTRIE_SHIFT_2]; // index2Capacity int32_t i2Length = 0; i3FirstNull = index3NullOffset; int32_t index3Start = fastIndexLength + index1Length; int32_t indexLength = index3Start; for (int32_t i = iStart; i < iLimit; i += UCPTRIE_INDEX_3_BLOCK_LENGTH) { int32_t i3; @@ -1180,35 +1327,40 @@ int32_t MutableCodePointTrie::compactInd f = dataNullOffset <= 0xffff ? I3_16 : I3_18; i3FirstNull = 0; } if (f == I3_NULL) { i3 = index3NullOffset; } else if (f == I3_BMP) { i3 = index[i]; } else if (f == I3_16) { - int32_t n = findSameBlock(index16, index3Start, indexLength, - index, i, UCPTRIE_INDEX_3_BLOCK_LENGTH); + int32_t n = mixedBlocks.findBlock(index16, index, i); if (n >= 0) { i3 = n; } else { if (indexLength == index3Start) { // No overlap at the boundary between the index-1 and index-3 tables. n = 0; } else { n = getOverlap(index16, indexLength, index, i, UCPTRIE_INDEX_3_BLOCK_LENGTH); } i3 = indexLength - n; + int32_t prevIndexLength = indexLength; while (n < UCPTRIE_INDEX_3_BLOCK_LENGTH) { index16[indexLength++] = index[i + n++]; } + mixedBlocks.extend(index16, index3Start, prevIndexLength, indexLength); + if (hasLongI3Blocks) { + longI3Blocks.extend(index16, index3Start, prevIndexLength, indexLength); + } } } else { U_ASSERT(f == I3_18); + U_ASSERT(hasLongI3Blocks); // Encode an index-3 block that contains one or more data indexes exceeding 16 bits. int32_t j = i; int32_t jLimit = i + UCPTRIE_INDEX_3_BLOCK_LENGTH; int32_t k = indexLength; do { ++k; uint32_t v = index[j++]; uint32_t upperBits = (v & 0x30000) >> 2; @@ -1231,37 +1383,41 @@ int32_t MutableCodePointTrie::compactInd v = index[j++]; upperBits |= (v & 0x30000) >> 14; index16[k++] = v; v = index[j++]; upperBits |= (v & 0x30000) >> 16; index16[k++] = v; index16[k - 9] = upperBits; } while (j < jLimit); - int32_t n = findSameBlock(index16, index3Start, indexLength, - index16, indexLength, INDEX_3_18BIT_BLOCK_LENGTH); + int32_t n = longI3Blocks.findBlock(index16, index16, indexLength); if (n >= 0) { i3 = n | 0x8000; } else { if (indexLength == index3Start) { // No overlap at the boundary between the index-1 and index-3 tables. n = 0; } else { n = getOverlap(index16, indexLength, index16, indexLength, INDEX_3_18BIT_BLOCK_LENGTH); } i3 = (indexLength - n) | 0x8000; + int32_t prevIndexLength = indexLength; if (n > 0) { int32_t start = indexLength; while (n < INDEX_3_18BIT_BLOCK_LENGTH) { index16[indexLength++] = index16[start + n++]; } } else { indexLength += INDEX_3_18BIT_BLOCK_LENGTH; } + mixedBlocks.extend(index16, index3Start, prevIndexLength, indexLength); + if (hasLongI3Blocks) { + longI3Blocks.extend(index16, index3Start, prevIndexLength, indexLength); + } } } if (index3NullOffset < 0 && i3FirstNull >= 0) { index3NullOffset = i3; } // Set the index-2 table entry. index2[i2Length++] = i3; } @@ -1274,39 +1430,48 @@ int32_t MutableCodePointTrie::compactInd if (indexLength >= (UCPTRIE_NO_INDEX3_NULL_OFFSET + UCPTRIE_INDEX_3_BLOCK_LENGTH)) { // The index-3 offsets exceed 15 bits, or // the last one cannot be distinguished from the no-null-block value. errorCode = U_INDEX_OUTOFBOUNDS_ERROR; return 0; } // Compact the index-2 table and write the index-1 table. + static_assert(UCPTRIE_INDEX_2_BLOCK_LENGTH == UCPTRIE_INDEX_3_BLOCK_LENGTH, + "must re-init mixedBlocks"); int32_t blockLength = UCPTRIE_INDEX_2_BLOCK_LENGTH; int32_t i1 = fastIndexLength; for (int32_t i = 0; i < i2Length; i += blockLength) { - if ((i2Length - i) < blockLength) { + int32_t n; + if ((i2Length - i) >= blockLength) { + // normal block + U_ASSERT(blockLength == UCPTRIE_INDEX_2_BLOCK_LENGTH); + n = mixedBlocks.findBlock(index16, index2, i); + } else { // highStart is inside the last index-2 block. Shorten it. blockLength = i2Length - i; + n = findSameBlock(index16, index3Start, indexLength, + index2, i, blockLength); } int32_t i2; - int32_t n = findSameBlock(index16, index3Start, indexLength, - index2, i, blockLength); if (n >= 0) { i2 = n; } else { if (indexLength == index3Start) { // No overlap at the boundary between the index-1 and index-3/2 tables. n = 0; } else { n = getOverlap(index16, indexLength, index2, i, blockLength); } i2 = indexLength - n; + int32_t prevIndexLength = indexLength; while (n < blockLength) { index16[indexLength++] = index2[i + n++]; } + mixedBlocks.extend(index16, index3Start, prevIndexLength, indexLength); } // Set the index-1 table entry. index16[i1++] = i2; } U_ASSERT(i1 == index3Start); U_ASSERT(indexLength <= index16Capacity); #ifdef UCPTRIE_DEBUG @@ -1364,17 +1529,21 @@ int32_t MutableCodePointTrie::compactTri uint32_t *newData = (uint32_t *)uprv_malloc(newDataCapacity * 4); if (newData == nullptr) { errorCode = U_MEMORY_ALLOCATION_ERROR; return 0; } uprv_memcpy(newData, asciiData, sizeof(asciiData)); int32_t dataNullIndex = allSameBlocks.findMostUsed(); - int32_t newDataLength = compactData(fastILimit, newData, dataNullIndex); + + MixedBlocks mixedBlocks; + int32_t newDataLength = compactData(fastILimit, newData, newDataCapacity, + dataNullIndex, mixedBlocks, errorCode); + if (U_FAILURE(errorCode)) { return 0; } U_ASSERT(newDataLength <= newDataCapacity); uprv_free(data); data = newData; dataCapacity = newDataCapacity; dataLength = newDataLength; if (dataLength > (0x3ffff + UCPTRIE_SMALL_DATA_BLOCK_LENGTH)) { // The offset of the last data block is too high to be stored in the index table. errorCode = U_INDEX_OUTOFBOUNDS_ERROR; @@ -1389,17 +1558,17 @@ int32_t MutableCodePointTrie::compactTri (long)initialValue, (long)data[dataNullOffset]); } #endif initialValue = data[dataNullOffset]; } else { dataNullOffset = UCPTRIE_NO_DATA_NULL_OFFSET; } - int32_t indexLength = compactIndex(fastILimit, errorCode); + int32_t indexLength = compactIndex(fastILimit, mixedBlocks, errorCode); highStart = realHighStart; return indexLength; } UCPTrie *MutableCodePointTrie::build(UCPTrieType type, UCPTrieValueWidth valueWidth, UErrorCode &errorCode) { if (U_FAILURE(errorCode)) { return nullptr; } diff --git a/intl/icu/source/common/unicode/uniset.h b/intl/icu/source/common/unicode/uniset.h --- a/intl/icu/source/common/unicode/uniset.h +++ b/intl/icu/source/common/unicode/uniset.h @@ -22,17 +22,16 @@ * \file * \brief C++ API: Unicode Set */ U_NAMESPACE_BEGIN // Forward Declarations. class BMPSet; -class CharacterProperties; class ParsePosition; class RBBIRuleScanner; class SymbolTable; class UnicodeSetStringSpan; class UVector; class RuleCharacterIterator; /** @@ -271,43 +270,56 @@ class RuleCharacterIterator; * the use of default parameter values. * Instead, such methods set the UnicodeSet into a "bogus" state * (see isBogus()) if an error occurs. * * @author Alan Liu * @stable ICU 2.0 */ class U_COMMON_API UnicodeSet U_FINAL : public UnicodeFilter { +private: + /** + * Enough for sets with few ranges. + * For example, White_Space has 10 ranges, list length 21. + */ + static constexpr int32_t INITIAL_CAPACITY = 25; + // fFlags constant + static constexpr uint8_t kIsBogus = 1; // This set is bogus (i.e. not valid) - int32_t len; // length of list used; 0 <= len <= capacity - int32_t capacity; // capacity of list - UChar32* list; // MUST be terminated with HIGH - BMPSet *bmpSet; // The set is frozen iff either bmpSet or stringSpan is not NULL. - UChar32* buffer; // internal buffer, may be NULL - int32_t bufferCapacity; // capacity of buffer - int32_t patLen; + UChar32* list = stackList; // MUST be terminated with HIGH + int32_t capacity = INITIAL_CAPACITY; // capacity of list + int32_t len = 1; // length of list used; 1 <= len <= capacity + uint8_t fFlags = 0; // Bit flag (see constants above) + + BMPSet *bmpSet = nullptr; // The set is frozen iff either bmpSet or stringSpan is not NULL. + UChar32* buffer = nullptr; // internal buffer, may be NULL + int32_t bufferCapacity = 0; // capacity of buffer /** * The pattern representation of this set. This may not be the * most economical pattern. It is the pattern supplied to * applyPattern(), with variables substituted and whitespace * removed. For sets constructed without applyPattern(), or * modified using the non-pattern API, this string will be empty, * indicating that toPattern() must generate a pattern * representation from the inversion list. */ - char16_t *pat; - UVector* strings; // maintained in sorted order - UnicodeSetStringSpan *stringSpan; + char16_t *pat = nullptr; + int32_t patLen = 0; + + UVector* strings = nullptr; // maintained in sorted order + UnicodeSetStringSpan *stringSpan = nullptr; -private: - enum { // constants - kIsBogus = 1 // This set is bogus (i.e. not valid) - }; - uint8_t fFlags; // Bit flag (see constants above) + /** + * Initial list array. + * Avoids some heap allocations, and list is never nullptr. + * Increases the object size a bit. + */ + UChar32 stackList[INITIAL_CAPACITY]; + public: /** * Determine if this object contains a valid set. * A bogus set has no value. It is different from an empty set. * It can be used to indicate that no set value is available. * * @return TRUE if the set is bogus/invalid, FALSE otherwise * @see setToBogus() @@ -1475,18 +1487,16 @@ public: virtual UClassID getDynamicClassID(void) const; private: // Private API for the USet API friend class USetAccess; - int32_t getStringCount() const; - const UnicodeString* getString(int32_t index) const; //---------------------------------------------------------------- // RuleBasedTransliterator support //---------------------------------------------------------------- private: @@ -1523,23 +1533,28 @@ private: UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute), int32_t depth, UErrorCode& ec); //---------------------------------------------------------------- // Implementation: Utility methods //---------------------------------------------------------------- - void ensureCapacity(int32_t newLen, UErrorCode& ec); + static int32_t nextCapacity(int32_t minCapacity); - void ensureBufferCapacity(int32_t newLen, UErrorCode& ec); + bool ensureCapacity(int32_t newLen); + + bool ensureBufferCapacity(int32_t newLen); void swapBuffers(void); UBool allocateStrings(UErrorCode &status); + UBool hasStrings() const; + int32_t stringsSize() const; + UBool stringsContains(const UnicodeString &s) const; UnicodeString& _toPattern(UnicodeString& result, UBool escapeUnprintable) const; UnicodeString& _generatePattern(UnicodeString& result, UBool escapeUnprintable) const; static void _appendToPat(UnicodeString& buf, const UnicodeString& s, UBool escapeUnprintable); @@ -1609,17 +1624,16 @@ private: UnicodeSet& applyPropertyPattern(const UnicodeString& pattern, ParsePosition& ppos, UErrorCode &ec); void applyPropertyPattern(RuleCharacterIterator& chars, UnicodeString& rebuiltPat, UErrorCode& ec); - friend class CharacterProperties; static const UnicodeSet* getInclusions(int32_t src, UErrorCode &status); /** * A filter that returns TRUE if the given code point should be * included in the UnicodeSet being constructed. */ typedef UBool (*Filter)(UChar32 codePoint, void* context); @@ -1641,17 +1655,20 @@ private: void applyIntPropertyValue(const UCPMap *map, UCPMapValueFilter *filter, const void *context, UErrorCode &errorCode); #endif /* U_HIDE_DRAFT_API */ /** * Set the new pattern to cache. */ - void setPattern(const UnicodeString& newPat); + void setPattern(const UnicodeString& newPat) { + setPattern(newPat.getBuffer(), newPat.length()); + } + void setPattern(const char16_t *newPat, int32_t newPatLen); /** * Release existing cached pattern. */ void releasePattern(); friend class UnicodeSetIterator; }; diff --git a/intl/icu/source/common/uniset.cpp b/intl/icu/source/common/uniset.cpp --- a/intl/icu/source/common/uniset.cpp +++ b/intl/icu/source/common/uniset.cpp @@ -9,16 +9,17 @@ * 10/20/99 alan Creation. ********************************************************************** */ #include "unicode/utypes.h" #include "unicode/parsepos.h" #include "unicode/symtable.h" #include "unicode/uniset.h" +#include "unicode/ustring.h" #include "unicode/utf8.h" #include "unicode/utf16.h" #include "ruleiter.h" #include "cmemory.h" #include "cstring.h" #include "patternprops.h" #include "uelement.h" #include "util.h" @@ -48,21 +49,18 @@ #define EQUALS ((UChar)0x003D) /*=*/ // HIGH_VALUE > all valid values. 110000 for codepoints #define UNICODESET_HIGH 0x0110000 // LOW <= all valid values. ZERO for codepoints #define UNICODESET_LOW 0x000000 -// initial storage. Must be >= 0 -#define START_EXTRA 16 - -// extra amount for growth. Must be >= 0 -#define GROW_EXTRA START_EXTRA +/** Max list [0, 1, 2, ..., max code point, HIGH] */ +constexpr int32_t MAX_LENGTH = UNICODESET_HIGH + 1; U_NAMESPACE_BEGIN SymbolTable::~SymbolTable() {} UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeSet) /** @@ -132,144 +130,92 @@ static void U_CALLCONV cloneUnicodeStrin } static int8_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) { const UnicodeString &a = *(const UnicodeString*)t1.pointer; const UnicodeString &b = *(const UnicodeString*)t2.pointer; return a.compare(b); } +UBool UnicodeSet::hasStrings() const { + return strings != nullptr && !strings->isEmpty(); +} + +int32_t UnicodeSet::stringsSize() const { + return strings == nullptr ? 0 : strings->size(); +} + +UBool UnicodeSet::stringsContains(const UnicodeString &s) const { + return strings != nullptr && strings->contains((void*) &s); +} + //---------------------------------------------------------------- // Constructors &c //---------------------------------------------------------------- /** * Constructs an empty set. */ -UnicodeSet::UnicodeSet() : - len(1), capacity(1 + START_EXTRA), list(0), bmpSet(0), buffer(0), - bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), - fFlags(0) -{ - UErrorCode status = U_ZERO_ERROR; - allocateStrings(status); - if (U_FAILURE(status)) { - setToBogus(); // If memory allocation failed, set to bogus state. - return; - } - list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); - if(list!=NULL){ - list[0] = UNICODESET_HIGH; - } else { // If memory allocation failed, set to bogus state. - setToBogus(); - return; - } +UnicodeSet::UnicodeSet() { + list[0] = UNICODESET_HIGH; _dbgct(this); } /** * Constructs a set containing the given range. If end > * start then an empty set is created. * * @param start first character, inclusive, of range * @param end last character, inclusive, of range */ -UnicodeSet::UnicodeSet(UChar32 start, UChar32 end) : - len(1), capacity(1 + START_EXTRA), list(0), bmpSet(0), buffer(0), - bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), - fFlags(0) -{ - UErrorCode status = U_ZERO_ERROR; - allocateStrings(status); - if (U_FAILURE(status)) { - setToBogus(); // If memory allocation failed, set to bogus state. - return; - } - list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); - if(list!=NULL){ - list[0] = UNICODESET_HIGH; - complement(start, end); - } else { // If memory allocation failed, set to bogus state. - setToBogus(); - return; - } +UnicodeSet::UnicodeSet(UChar32 start, UChar32 end) { + list[0] = UNICODESET_HIGH; + add(start, end); _dbgct(this); } /** * Constructs a set that is identical to the given UnicodeSet. */ -UnicodeSet::UnicodeSet(const UnicodeSet& o) : - UnicodeFilter(o), - len(0), capacity(o.isFrozen() ? o.len : o.len + GROW_EXTRA), list(0), - bmpSet(0), - buffer(0), bufferCapacity(0), - patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), - fFlags(0) -{ - UErrorCode status = U_ZERO_ERROR; - allocateStrings(status); - if (U_FAILURE(status)) { - setToBogus(); // If memory allocation failed, set to bogus state. - return; - } - list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); - if(list!=NULL){ - *this = o; - } else { // If memory allocation failed, set to bogus state. - setToBogus(); - return; - } +UnicodeSet::UnicodeSet(const UnicodeSet& o) : UnicodeFilter(o) { + *this = o; _dbgct(this); } // Copy-construct as thawed. -UnicodeSet::UnicodeSet(const UnicodeSet& o, UBool /* asThawed */) : - UnicodeFilter(o), - len(0), capacity(o.len + GROW_EXTRA), list(0), - bmpSet(0), - buffer(0), bufferCapacity(0), - patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), - fFlags(0) -{ - UErrorCode status = U_ZERO_ERROR; - allocateStrings(status); - if (U_FAILURE(status)) { - setToBogus(); // If memory allocation failed, set to bogus state. - return; - } - list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); - if(list!=NULL){ +UnicodeSet::UnicodeSet(const UnicodeSet& o, UBool /* asThawed */) : UnicodeFilter(o) { + if (ensureCapacity(o.len)) { // *this = o except for bmpSet and stringSpan len = o.len; uprv_memcpy(list, o.list, (size_t)len*sizeof(UChar32)); - if (strings != NULL && o.strings != NULL) { - strings->assign(*o.strings, cloneUnicodeString, status); - } else { // Invalid strings. - setToBogus(); - return; + if (o.hasStrings()) { + UErrorCode status = U_ZERO_ERROR; + if (!allocateStrings(status) || + (strings->assign(*o.strings, cloneUnicodeString, status), U_FAILURE(status))) { + setToBogus(); + return; + } } if (o.pat) { - setPattern(UnicodeString(o.pat, o.patLen)); + setPattern(o.pat, o.patLen); } - } else { // If memory allocation failed, set to bogus state. - setToBogus(); - return; + _dbgct(this); } - _dbgct(this); } /** * Destructs the set. */ UnicodeSet::~UnicodeSet() { _dbgdt(this); // first! - uprv_free(list); + if (list != stackList) { + uprv_free(list); + } delete bmpSet; - if (buffer) { + if (buffer != stackList) { uprv_free(buffer); } delete strings; delete stringSpan; releasePattern(); } /** @@ -285,51 +231,49 @@ UnicodeSet& UnicodeSet::copyFrom(const U } if (isFrozen()) { return *this; } if (o.isBogus()) { setToBogus(); return *this; } - UErrorCode ec = U_ZERO_ERROR; - ensureCapacity(o.len, ec); - if (U_FAILURE(ec)) { + if (!ensureCapacity(o.len)) { // ensureCapacity will mark the UnicodeSet as Bogus if OOM failure happens. return *this; } len = o.len; uprv_memcpy(list, o.list, (size_t)len*sizeof(UChar32)); - if (o.bmpSet == NULL || asThawed) { - bmpSet = NULL; - } else { + if (o.bmpSet != nullptr && !asThawed) { bmpSet = new BMPSet(*o.bmpSet, list, len); if (bmpSet == NULL) { // Check for memory allocation error. setToBogus(); return *this; } } - if (strings != NULL && o.strings != NULL) { - strings->assign(*o.strings, cloneUnicodeString, ec); - } else { // Invalid strings. - setToBogus(); - return *this; + if (o.hasStrings()) { + UErrorCode status = U_ZERO_ERROR; + if ((strings == nullptr && !allocateStrings(status)) || + (strings->assign(*o.strings, cloneUnicodeString, status), U_FAILURE(status))) { + setToBogus(); + return *this; + } + } else if (hasStrings()) { + strings->removeAllElements(); } - if (o.stringSpan == NULL || asThawed) { - stringSpan = NULL; - } else { + if (o.stringSpan != nullptr && !asThawed) { stringSpan = new UnicodeSetStringSpan(*o.stringSpan, *strings); if (stringSpan == NULL) { // Check for memory allocation error. setToBogus(); return *this; } } releasePattern(); if (o.pat) { - setPattern(UnicodeString(o.pat, o.patLen)); + setPattern(o.pat, o.patLen); } return *this; } /** * Returns a copy of this object. All UnicodeMatcher objects have * to support cloning in order to allow classes using * UnicodeMatchers, such as Transliterator, to implement cloning. @@ -352,17 +296,18 @@ UnicodeFunctor *UnicodeSet::cloneAsThawe * @param o set to be compared for equality with this set. * @return true if the specified set is equal to this set. */ UBool UnicodeSet::operator==(const UnicodeSet& o) const { if (len != o.len) return FALSE; for (int32_t i = 0; i < len; ++i) { if (list[i] != o.list[i]) return FALSE; } - if (*strings != *o.strings) return FALSE; + if (hasStrings() != o.hasStrings()) { return FALSE; } + if (hasStrings() && *strings != *o.strings) return FALSE; return TRUE; } /** * Returns the hash code value for this set. * * @return the hash code value for this set. * @see Object#hashCode() @@ -388,26 +333,26 @@ int32_t UnicodeSet::hashCode(void) const * @return the number of elements in this set (its cardinality). */ int32_t UnicodeSet::size(void) const { int32_t n = 0; int32_t count = getRangeCount(); for (int32_t i = 0; i < count; ++i) { n += getRangeEnd(i) - getRangeStart(i) + 1; } - return n + strings->size(); + return n + stringsSize(); } /** * Returns true if this set contains no elements. * * @return true if this set contains no elements. */ UBool UnicodeSet::isEmpty(void) const { - return len == 1 && strings->size() == 0; + return len == 1 && !hasStrings(); } /** * Returns true if this set contains the given character. * @param c character to be checked for containment * @return true if the test condition is met */ UBool UnicodeSet::contains(UChar32 c) const { @@ -497,17 +442,17 @@ UBool UnicodeSet::contains(UChar32 start * multicharacter string. * @param s string to be checked for containment * @return true if this set contains the specified string */ UBool UnicodeSet::contains(const UnicodeString& s) const { if (s.length() == 0) return FALSE; int32_t cp = getSingleCP(s); if (cp < 0) { - return strings->contains((void*) &s); + return stringsContains(s); } else { return contains((UChar32) cp); } } /** * Returns true if this set contains all the characters and strings * of the given set. @@ -519,18 +464,17 @@ UBool UnicodeSet::containsAll(const Unic // this set. It's possible to code this more efficiently in terms of // direct manipulation of the inversion lists if the need arises. int32_t n = c.getRangeCount(); for (int i=0; icontainsAll(*c.strings)) return FALSE; - return TRUE; + return !c.hasStrings() || (strings != nullptr && strings->containsAll(*c.strings)); } /** * Returns true if this set contains all the characters * of the given string. * @param s string containing characters to be checked for containment * @return true if the test condition is met */ @@ -566,18 +510,17 @@ UBool UnicodeSet::containsNone(const Uni // this set. It's possible to code this more efficiently in terms of // direct manipulation of the inversion lists if the need arises. int32_t n = c.getRangeCount(); for (int32_t i=0; icontainsNone(*c.strings)) return FALSE; - return TRUE; + return strings == nullptr || !c.hasStrings() || strings->containsNone(*c.strings); } /** * Returns true if this set contains none of the characters * of the given string. * @param s string containing characters to be checked for containment * @return true if the test condition is met */ @@ -608,17 +551,17 @@ UBool UnicodeSet::matchesIndexValue(uint if ((low & ~0xFF) == (high & ~0xFF)) { if ((low & 0xFF) <= v && v <= (high & 0xFF)) { return TRUE; } } else if ((low & 0xFF) <= v || v <= (high & 0xFF)) { return TRUE; } } - if (strings->size() != 0) { + if (hasStrings()) { for (i=0; isize(); ++i) { const UnicodeString& s = *(const UnicodeString*)strings->elementAt(i); //if (s.length() == 0) { // // Empty strings match everything // return TRUE; //} // assert(s.length() != 0); // We enforce this elsewhere UChar32 c = s.char32At(0); @@ -643,17 +586,17 @@ UMatchDegree UnicodeSet::matches(const R // about them here. If we ever allow zero-length strings // we much check for them here. if (contains(U_ETHER)) { return incremental ? U_PARTIAL_MATCH : U_MATCH; } else { return U_MISMATCH; } } else { - if (strings->size() != 0) { // try strings first + if (hasStrings()) { // try strings first // might separate forward and backward loops later // for now they are combined // TODO Improve efficiency of this, at least in the forward // direction, if not in both. In the forward direction we // can assume the strings are sorted. @@ -844,17 +787,49 @@ UnicodeSet& UnicodeSet::set(UChar32 star * * @param start first character, inclusive, of range to be added * to this set. * @param end last character, inclusive, of range to be added * to this set. */ UnicodeSet& UnicodeSet::add(UChar32 start, UChar32 end) { if (pinCodePoint(start) < pinCodePoint(end)) { - UChar32 range[3] = { start, end+1, UNICODESET_HIGH }; + UChar32 limit = end + 1; + // Fast path for adding a new range after the last one. + // Odd list length: [..., lastStart, lastLimit, HIGH] + if ((len & 1) != 0) { + // If the list is empty, set lastLimit low enough to not be adjacent to 0. + UChar32 lastLimit = len == 1 ? -2 : list[len - 2]; + if (lastLimit <= start && !isFrozen() && !isBogus()) { + if (lastLimit == start) { + // Extend the last range. + list[len - 2] = limit; + if (limit == UNICODESET_HIGH) { + --len; + } + } else { + list[len - 1] = start; + if (limit < UNICODESET_HIGH) { + if (ensureCapacity(len + 2)) { + list[len++] = limit; + list[len++] = UNICODESET_HIGH; + } + } else { // limit == UNICODESET_HIGH + if (ensureCapacity(len + 1)) { + list[len++] = UNICODESET_HIGH; + } + } + } + releasePattern(); + return *this; + } + } + // This is slow. Could be much faster using findCodePoint(start) + // and modifying the list, dealing with adjacent & overlapping ranges. + UChar32 range[3] = { start, limit, UNICODESET_HIGH }; add(range, 2, 0); } else if (start == end) { add(start); } return *this; } // #define DEBUG_US_ADD @@ -913,19 +888,17 @@ UnicodeSet& UnicodeSet::add(UChar32 c) { printf(" => "); #endif if (c == list[i]-1) { // c is before start of next range list[i] = c; // if we touched the HIGH mark, then add a new one if (c == (UNICODESET_HIGH - 1)) { - UErrorCode status = U_ZERO_ERROR; - ensureCapacity(len+1, status); - if (U_FAILURE(status)) { + if (!ensureCapacity(len+1)) { // ensureCapacity will mark the object as Bogus if OOM failure happens. return *this; } list[len++] = UNICODESET_HIGH; } if (i > 0 && c == list[i-1]) { // collapse adjacent ranges @@ -959,31 +932,23 @@ UnicodeSet& UnicodeSet::add(UChar32 c) { // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH] // ^ // list[i] // [..., start_k-1, limit_k-1, c, c+1, start_k, limit_k, ..., HIGH] // ^ // list[i] - UErrorCode status = U_ZERO_ERROR; - ensureCapacity(len+2, status); - if (U_FAILURE(status)) { + if (!ensureCapacity(len+2)) { // ensureCapacity will mark the object as Bogus if OOM failure happens. return *this; } - //for (int32_t k=len-1; k>=i; --k) { - // list[k+2] = list[k]; - //} - UChar32* src = list + len; - UChar32* dst = src + 2; - UChar32* srclimit = list + i; - while (src > srclimit) *(--dst) = *(--src); - + UChar32 *p = list + i; + uprv_memmove(p + 2, p, (len - i) * sizeof(*p)); list[i] = c; list[i+1] = c+1; len += 2; } #ifdef DEBUG_US_ADD dump(list, len); printf("\n"); @@ -1009,17 +974,17 @@ UnicodeSet& UnicodeSet::add(UChar32 c) { *
Warning: you cannot add an empty string ("") to a UnicodeSet. * @param s the source string * @return the modified set, for chaining */ UnicodeSet& UnicodeSet::add(const UnicodeString& s) { if (s.length() == 0 || isFrozen() || isBogus()) return *this; int32_t cp = getSingleCP(s); if (cp < 0) { - if (!strings->contains((void*) &s)) { + if (!stringsContains(s)) { _add(s); releasePattern(); } } else { add((UChar32)cp); } return *this; } @@ -1028,22 +993,26 @@ UnicodeSet& UnicodeSet::add(const Unicod * Adds the given string, in order, to 'strings'. The given string * must have been checked by the caller to not be empty and to not * already be in 'strings'. */ void UnicodeSet::_add(const UnicodeString& s) { if (isFrozen() || isBogus()) { return; } + UErrorCode ec = U_ZERO_ERROR; + if (strings == nullptr && !allocateStrings(ec)) { + setToBogus(); + return; + } UnicodeString* t = new UnicodeString(s); if (t == NULL) { // Check for memory allocation error. setToBogus(); return; } - UErrorCode ec = U_ZERO_ERROR; strings->sortedInsert(t, compareUnicodeString, ec); if (U_FAILURE(ec)) { setToBogus(); delete t; } } /** @@ -1116,17 +1085,20 @@ UnicodeSet& UnicodeSet::complementAll(co UnicodeSet& UnicodeSet::removeAll(const UnicodeString& s) { UnicodeSet set; set.addAll(s); removeAll(set); return *this; } UnicodeSet& UnicodeSet::removeAllStrings() { - strings->removeAllElements(); + if (!isFrozen() && hasStrings()) { + strings->removeAllElements(); + releasePattern(); + } return *this; } /** * Makes a set from a multicharacter string. Thus "ch" => {"ch"} *
Warning: you cannot add an empty string ("") to a UnicodeSet. * @param the source string @@ -1212,18 +1184,19 @@ UnicodeSet& UnicodeSet::remove(UChar32 c * returns. * @param the source string * @return the modified set, for chaining */ UnicodeSet& UnicodeSet::remove(const UnicodeString& s) { if (s.length() == 0 || isFrozen() || isBogus()) return *this; int32_t cp = getSingleCP(s); if (cp < 0) { - strings->removeElement((void*) &s); - releasePattern(); + if (strings != nullptr && strings->removeElement((void*) &s)) { + releasePattern(); + } } else { remove((UChar32)cp, (UChar32)cp); } return *this; } /** * Complements the specified range in this set. Any character in @@ -1255,51 +1228,44 @@ UnicodeSet& UnicodeSet::complement(UChar /** * This is equivalent to * complement(MIN_VALUE, MAX_VALUE). */ UnicodeSet& UnicodeSet::complement(void) { if (isFrozen() || isBogus()) { return *this; } - UErrorCode status = U_ZERO_ERROR; if (list[0] == UNICODESET_LOW) { - ensureBufferCapacity(len-1, status); - if (U_FAILURE(status)) { - return *this; - } - uprv_memcpy(buffer, list + 1, (size_t)(len-1)*sizeof(UChar32)); + uprv_memmove(list, list + 1, (size_t)(len-1)*sizeof(UChar32)); --len; } else { - ensureBufferCapacity(len+1, status); - if (U_FAILURE(status)) { + if (!ensureCapacity(len+1)) { return *this; } - uprv_memcpy(buffer + 1, list, (size_t)len*sizeof(UChar32)); - buffer[0] = UNICODESET_LOW; + uprv_memmove(list + 1, list, (size_t)len*sizeof(UChar32)); + list[0] = UNICODESET_LOW; ++len; } - swapBuffers(); releasePattern(); return *this; } /** * Complement the specified string in this set. * The set will not contain the specified string once the call * returns. *
Warning: you cannot add an empty string ("") to a UnicodeSet. * @param s the string to complement * @return this object, for chaining */ UnicodeSet& UnicodeSet::complement(const UnicodeString& s) { if (s.length() == 0 || isFrozen() || isBogus()) return *this; int32_t cp = getSingleCP(s); if (cp < 0) { - if (strings->contains((void*) &s)) { + if (stringsContains(s)) { strings->removeElement((void*) &s); } else { _add(s); } releasePattern(); } else { complement((UChar32)cp, (UChar32)cp); } @@ -1320,17 +1286,17 @@ UnicodeSet& UnicodeSet::addAll(const Uni if ( c.len>0 && c.list!=NULL ) { add(c.list, c.len, 0); } // Add strings in order if ( c.strings!=NULL ) { for (int32_t i=0; isize(); ++i) { const UnicodeString* s = (const UnicodeString*)c.strings->elementAt(i); - if (!strings->contains((void*) s)) { + if (!stringsContains(*s)) { _add(*s); } } } return *this; } /** @@ -1342,17 +1308,23 @@ UnicodeSet& UnicodeSet::addAll(const Uni * * @param c set that defines which elements this set will retain. */ UnicodeSet& UnicodeSet::retainAll(const UnicodeSet& c) { if (isFrozen() || isBogus()) { return *this; } retain(c.list, c.len, 0); - strings->retainAll(*c.strings); + if (hasStrings()) { + if (!c.hasStrings()) { + strings->removeAllElements(); + } else { + strings->retainAll(*c.strings); + } + } return *this; } /** * Removes from this set all of its elements that are contained in the * specified set. This operation effectively modifies this * set so that its value is the asymmetric set difference of * the two sets. @@ -1360,17 +1332,19 @@ UnicodeSet& UnicodeSet::retainAll(const * @param c set that defines which elements will be removed from * this set. */ UnicodeSet& UnicodeSet::removeAll(const UnicodeSet& c) { if (isFrozen() || isBogus()) { return *this; } retain(c.list, c.len, 2); - strings->removeAll(*c.strings); + if (hasStrings() && c.hasStrings()) { + strings->removeAll(*c.strings); + } return *this; } /** * Complements in this set all elements contained in the specified * set. Any character in the other set will be removed if it is * in this set, or will be added if it is not in this set. * @@ -1378,45 +1352,43 @@ UnicodeSet& UnicodeSet::removeAll(const * this set. */ UnicodeSet& UnicodeSet::complementAll(const UnicodeSet& c) { if (isFrozen() || isBogus()) { return *this; } exclusiveOr(c.list, c.len, 0); - for (int32_t i=0; isize(); ++i) { - void* e = c.strings->elementAt(i); - if (!strings->removeElement(e)) { - _add(*(const UnicodeString*)e); + if (c.strings != nullptr) { + for (int32_t i=0; isize(); ++i) { + void* e = c.strings->elementAt(i); + if (strings == nullptr || !strings->removeElement(e)) { + _add(*(const UnicodeString*)e); + } } } return *this; } /** * Removes all of the elements from this set. This set will be * empty after this call returns. */ UnicodeSet& UnicodeSet::clear(void) { if (isFrozen()) { return *this; } - if (list != NULL) { - list[0] = UNICODESET_HIGH; - } + list[0] = UNICODESET_HIGH; len = 1; releasePattern(); if (strings != NULL) { strings->removeAllElements(); } - if (list != NULL && strings != NULL) { - // Remove bogus - fFlags = 0; - } + // Remove bogus + fFlags = 0; return *this; } /** * Iteration method that returns the number of ranges contained in * this set. * @see #getRangeStart * @see #getRangeEnd @@ -1440,115 +1412,113 @@ UChar32 UnicodeSet::getRangeStart(int32_ * specified range of this set. * @see #getRangeStart * @see #getRangeEnd */ UChar32 UnicodeSet::getRangeEnd(int32_t index) const { return list[index*2 + 1] - 1; } -int32_t UnicodeSet::getStringCount() const { - return strings->size(); -} - const UnicodeString* UnicodeSet::getString(int32_t index) const { return (const UnicodeString*) strings->elementAt(index); } /** * Reallocate this objects internal structures to take up the least * possible space, without changing this object's value. */ UnicodeSet& UnicodeSet::compact() { if (isFrozen() || isBogus()) { return *this; } // Delete buffer first to defragment memory less. - if (buffer != NULL) { + if (buffer != stackList) { uprv_free(buffer); buffer = NULL; + bufferCapacity = 0; } - if (len < capacity) { - // Make the capacity equal to len or 1. - // We don't want to realloc of 0 size. - int32_t newCapacity = len + (len == 0); - UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * newCapacity); + if (list == stackList) { + // pass + } else if (len <= INITIAL_CAPACITY) { + uprv_memcpy(stackList, list, len * sizeof(UChar32)); + uprv_free(list); + list = stackList; + capacity = INITIAL_CAPACITY; + } else if ((len + 7) < capacity) { + // If we have more than a little unused capacity, shrink it to len. + UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * len); if (temp) { list = temp; - capacity = newCapacity; + capacity = len; } // else what the heck happened?! We allocated less memory! // Oh well. We'll keep our original array. } + if (strings != nullptr && strings->isEmpty()) { + delete strings; + strings = nullptr; + } return *this; } #ifdef DEBUG_SERIALIZE #include #endif /** * Deserialize constructor. */ -UnicodeSet::UnicodeSet(const uint16_t data[], int32_t dataLen, ESerialization serialization, UErrorCode &ec) - : len(1), capacity(1+START_EXTRA), list(0), bmpSet(0), buffer(0), - bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), - fFlags(0) { +UnicodeSet::UnicodeSet(const uint16_t data[], int32_t dataLen, ESerialization serialization, + UErrorCode &ec) { if(U_FAILURE(ec)) { setToBogus(); return; } if( (serialization != kSerialized) || (data==NULL) || (dataLen < 1)) { ec = U_ILLEGAL_ARGUMENT_ERROR; setToBogus(); return; } - allocateStrings(ec); - if (U_FAILURE(ec)) { - setToBogus(); - return; - } - // bmp? int32_t headerSize = ((data[0]&0x8000)) ?2:1; int32_t bmpLength = (headerSize==1)?data[0]:data[1]; - len = (((data[0]&0x7FFF)-bmpLength)/2)+bmpLength; + int32_t newLength = (((data[0]&0x7FFF)-bmpLength)/2)+bmpLength; #ifdef DEBUG_SERIALIZE - printf("dataLen %d headerSize %d bmpLen %d len %d. data[0]=%X/%X/%X/%X\n", dataLen,headerSize,bmpLength,len, data[0],data[1],data[2],data[3]); + printf("dataLen %d headerSize %d bmpLen %d len %d. data[0]=%X/%X/%X/%X\n", dataLen,headerSize,bmpLength,newLength, data[0],data[1],data[2],data[3]); #endif - capacity = len+1; - list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); - if(!list || U_FAILURE(ec)) { - setToBogus(); + if(!ensureCapacity(newLength + 1)) { // +1 for HIGH return; } // copy bmp int32_t i; for(i = 0; i< bmpLength;i++) { list[i] = data[i+headerSize]; #ifdef DEBUG_SERIALIZE printf("<<16@%d[%d] %X\n", i+headerSize, i, list[i]); #endif } // copy smp - for(i=bmpLength;i MAX_LENGTH) { + newCapacity = MAX_LENGTH; + } + return newCapacity; } - UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * (newLen + GROW_EXTRA)); +} + +bool UnicodeSet::ensureCapacity(int32_t newLen) { + if (newLen > MAX_LENGTH) { + newLen = MAX_LENGTH; + } + if (newLen <= capacity) { + return true; + } + int32_t newCapacity = nextCapacity(newLen); + UChar32* temp = (UChar32*) uprv_malloc(newCapacity * sizeof(UChar32)); if (temp == NULL) { - ec = U_MEMORY_ALLOCATION_ERROR; setToBogus(); // set the object to bogus state if an OOM failure occurred. - return; + return false; + } + // Copy only the actual contents. + uprv_memcpy(temp, list, len * sizeof(UChar32)); + if (list != stackList) { + uprv_free(list); } list = temp; - capacity = newLen + GROW_EXTRA; - // else we keep the original contents on the memory failure. + capacity = newCapacity; + return true; } -void UnicodeSet::ensureBufferCapacity(int32_t newLen, UErrorCode& ec) { - if (buffer != NULL && newLen <= bufferCapacity) - return; - UChar32* temp = (UChar32*) uprv_realloc(buffer, sizeof(UChar32) * (newLen + GROW_EXTRA)); +bool UnicodeSet::ensureBufferCapacity(int32_t newLen) { + if (newLen > MAX_LENGTH) { + newLen = MAX_LENGTH; + } + if (newLen <= bufferCapacity) { + return true; + } + int32_t newCapacity = nextCapacity(newLen); + UChar32* temp = (UChar32*) uprv_malloc(newCapacity * sizeof(UChar32)); if (temp == NULL) { - ec = U_MEMORY_ALLOCATION_ERROR; setToBogus(); - return; + return false; + } + // The buffer has no contents to be copied. + // It is always filled from scratch after this call. + if (buffer != stackList) { + uprv_free(buffer); } buffer = temp; - bufferCapacity = newLen + GROW_EXTRA; - // else we keep the original contents on the memory failure. + bufferCapacity = newCapacity; + return true; } /** * Swap list and buffer. */ void UnicodeSet::swapBuffers(void) { // swap list and buffer UChar32* temp = list; @@ -1722,19 +1724,17 @@ static inline UChar32 max(UChar32 a, UCh // polarity = 0, 3 is normal: x xor y // polarity = 1, 2: x xor ~y == x === y void UnicodeSet::exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity) { if (isFrozen() || isBogus()) { return; } - UErrorCode status = U_ZERO_ERROR; - ensureBufferCapacity(len + otherLen, status); - if (U_FAILURE(status)) { + if (!ensureBufferCapacity(len + otherLen)) { return; } int32_t i = 0, j = 0, k = 0; UChar32 a = list[i++]; UChar32 b; if (polarity == 1 || polarity == 2) { b = UNICODESET_LOW; @@ -1772,19 +1772,17 @@ void UnicodeSet::exclusiveOr(const UChar // polarity = 2: x union ~y // polarity = 1: ~x union y // polarity = 3: ~x union ~y void UnicodeSet::add(const UChar32* other, int32_t otherLen, int8_t polarity) { if (isFrozen() || isBogus() || other==NULL) { return; } - UErrorCode status = U_ZERO_ERROR; - ensureBufferCapacity(len + otherLen, status); - if (U_FAILURE(status)) { + if (!ensureBufferCapacity(len + otherLen)) { return; } int32_t i = 0, j = 0, k = 0; UChar32 a = list[i++]; UChar32 b = other[j++]; // change from xor is that we have to check overlapping pairs // polarity bit 1 means a is second, bit 2 means b is. @@ -1885,19 +1883,17 @@ void UnicodeSet::add(const UChar32* othe // polarity = 2: x intersect ~y == set-minus // polarity = 1: ~x intersect y // polarity = 3: ~x intersect ~y void UnicodeSet::retain(const UChar32* other, int32_t otherLen, int8_t polarity) { if (isFrozen() || isBogus()) { return; } - UErrorCode status = U_ZERO_ERROR; - ensureBufferCapacity(len + otherLen, status); - if (U_FAILURE(status)) { + if (!ensureBufferCapacity(len + otherLen)) { return; } int32_t i = 0, j = 0, k = 0; UChar32 a = list[i++]; UChar32 b = other[j++]; // change from xor is that we have to check overlapping pairs // polarity bit 1 means a is second, bit 2 means b is. @@ -2133,22 +2129,24 @@ UnicodeString& UnicodeSet::_generatePatt if ((start+1) != end) { result.append(HYPHEN); } _appendToPat(result, end, escapeUnprintable); } } } - for (int32_t i = 0; isize(); ++i) { - result.append(OPEN_BRACE); - _appendToPat(result, - *(const UnicodeString*) strings->elementAt(i), - escapeUnprintable); - result.append(CLOSE_BRACE); + if (strings != nullptr) { + for (int32_t i = 0; isize(); ++i) { + result.append(OPEN_BRACE); + _appendToPat(result, + *(const UnicodeString*) strings->elementAt(i), + escapeUnprintable); + result.append(CLOSE_BRACE); + } } return result.append(SET_CLOSE); } /** * Release existing cached pattern */ void UnicodeSet::releasePattern() { @@ -2157,55 +2155,39 @@ void UnicodeSet::releasePattern() { pat = NULL; patLen = 0; } } /** * Set the new pattern to cache. */ -void UnicodeSet::setPattern(const UnicodeString& newPat) { +void UnicodeSet::setPattern(const char16_t *newPat, int32_t newPatLen) { releasePattern(); - int32_t newPatLen = newPat.length(); pat = (UChar *)uprv_malloc((newPatLen + 1) * sizeof(UChar)); if (pat) { patLen = newPatLen; - newPat.extractBetween(0, patLen, pat); + u_memcpy(pat, newPat, patLen); pat[patLen] = 0; } // else we don't care if malloc failed. This was just a nice cache. // We can regenerate an equivalent pattern later when requested. } UnicodeFunctor *UnicodeSet::freeze() { if(!isFrozen() && !isBogus()) { - // Do most of what compact() does before freezing because - // compact() will not work when the set is frozen. - // Small modification: Don't shrink if the savings would be tiny (<=GROW_EXTRA). + compact(); - // Delete buffer first to defragment memory less. - if (buffer != NULL) { - uprv_free(buffer); - buffer = NULL; - } - if (capacity > (len + GROW_EXTRA)) { - // Make the capacity equal to len or 1. - // We don't want to realloc of 0 size. - capacity = len + (len == 0); - list = (UChar32*) uprv_realloc(list, sizeof(UChar32) * capacity); - if (list == NULL) { // Check for memory allocation error. + // Optimize contains() and span() and similar functions. + if (hasStrings()) { + stringSpan = new UnicodeSetStringSpan(*this, *strings, UnicodeSetStringSpan::ALL); + if (stringSpan == nullptr) { setToBogus(); return this; - } - } - - // Optimize contains() and span() and similar functions. - if (!strings->isEmpty()) { - stringSpan = new UnicodeSetStringSpan(*this, *strings, UnicodeSetStringSpan::ALL); - if (stringSpan != NULL && !stringSpan->needsStringSpanUTF16()) { + } else if (!stringSpan->needsStringSpanUTF16()) { // All strings are irrelevant for span() etc. because // all of each string's code points are contained in this set. // Do not check needsStringSpanUTF8() because UTF-8 has at most as // many relevant strings as UTF-16. // (Thus needsStringSpanUTF8() implies needsStringSpanUTF16().) delete stringSpan; stringSpan = NULL; } @@ -2228,17 +2210,17 @@ int32_t UnicodeSet::span(const UChar *s, if(length<0) { length=u_strlen(s); } if(length==0) { return 0; } if(stringSpan!=NULL) { return stringSpan->span(s, length, spanCondition); - } else if(!strings->isEmpty()) { + } else if(hasStrings()) { uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ? UnicodeSetStringSpan::FWD_UTF16_NOT_CONTAINED : UnicodeSetStringSpan::FWD_UTF16_CONTAINED; UnicodeSetStringSpan strSpan(*this, *strings, which); if(strSpan.needsStringSpanUTF16()) { return strSpan.span(s, length, spanCondition); } } @@ -2265,17 +2247,17 @@ int32_t UnicodeSet::spanBack(const UChar if(length<0) { length=u_strlen(s); } if(length==0) { return 0; } if(stringSpan!=NULL) { return stringSpan->spanBack(s, length, spanCondition); - } else if(!strings->isEmpty()) { + } else if(hasStrings()) { uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ? UnicodeSetStringSpan::BACK_UTF16_NOT_CONTAINED : UnicodeSetStringSpan::BACK_UTF16_CONTAINED; UnicodeSetStringSpan strSpan(*this, *strings, which); if(strSpan.needsStringSpanUTF16()) { return strSpan.spanBack(s, length, spanCondition); } } @@ -2303,17 +2285,17 @@ int32_t UnicodeSet::spanUTF8(const char if(length<0) { length=(int32_t)uprv_strlen(s); } if(length==0) { return 0; } if(stringSpan!=NULL) { return stringSpan->spanUTF8((const uint8_t *)s, length, spanCondition); - } else if(!strings->isEmpty()) { + } else if(hasStrings()) { uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ? UnicodeSetStringSpan::FWD_UTF8_NOT_CONTAINED : UnicodeSetStringSpan::FWD_UTF8_CONTAINED; UnicodeSetStringSpan strSpan(*this, *strings, which); if(strSpan.needsStringSpanUTF8()) { return strSpan.spanUTF8((const uint8_t *)s, length, spanCondition); } } @@ -2341,17 +2323,17 @@ int32_t UnicodeSet::spanBackUTF8(const c if(length<0) { length=(int32_t)uprv_strlen(s); } if(length==0) { return 0; } if(stringSpan!=NULL) { return stringSpan->spanBackUTF8((const uint8_t *)s, length, spanCondition); - } else if(!strings->isEmpty()) { + } else if(hasStrings()) { uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ? UnicodeSetStringSpan::BACK_UTF8_NOT_CONTAINED : UnicodeSetStringSpan::BACK_UTF8_CONTAINED; UnicodeSetStringSpan strSpan(*this, *strings, which); if(strSpan.needsStringSpanUTF8()) { return strSpan.spanBackUTF8((const uint8_t *)s, length, spanCondition); } } diff --git a/intl/icu/source/common/uniset_closure.cpp b/intl/icu/source/common/uniset_closure.cpp --- a/intl/icu/source/common/uniset_closure.cpp +++ b/intl/icu/source/common/uniset_closure.cpp @@ -26,70 +26,40 @@ #include "unicode/parsepos.h" #include "unicode/uniset.h" #include "cmemory.h" #include "ruleiter.h" #include "ucase.h" #include "util.h" #include "uvector.h" -// initial storage. Must be >= 0 -// *** same as in uniset.cpp ! *** -#define START_EXTRA 16 - U_NAMESPACE_BEGIN // TODO memory debugging provided inside uniset.cpp // could be made available here but probably obsolete with use of modern // memory leak checker tools #define _dbgct(me) //---------------------------------------------------------------- // Constructors &c //---------------------------------------------------------------- UnicodeSet::UnicodeSet(const UnicodeString& pattern, uint32_t options, const SymbolTable* symbols, - UErrorCode& status) : - len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0), - bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), - fFlags(0) -{ - if(U_SUCCESS(status)){ - list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); - /* test for NULL */ - if(list == NULL) { - status = U_MEMORY_ALLOCATION_ERROR; - }else{ - allocateStrings(status); - applyPattern(pattern, options, symbols, status); - } - } + UErrorCode& status) { + applyPattern(pattern, options, symbols, status); _dbgct(this); } UnicodeSet::UnicodeSet(const UnicodeString& pattern, ParsePosition& pos, uint32_t options, const SymbolTable* symbols, - UErrorCode& status) : - len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0), - bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), - fFlags(0) -{ - if(U_SUCCESS(status)){ - list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); - /* test for NULL */ - if(list == NULL) { - status = U_MEMORY_ALLOCATION_ERROR; - }else{ - allocateStrings(status); - applyPattern(pattern, pos, options, symbols, status); - } - } + UErrorCode& status) { + applyPattern(pattern, pos, options, symbols, status); _dbgct(this); } //---------------------------------------------------------------- // Public API //---------------------------------------------------------------- UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern, @@ -194,17 +164,17 @@ UnicodeSet& UnicodeSet::closeOver(int32_ _set_addString, NULL, // don't need remove() NULL // don't need removeRange() }; // start with input set to guarantee inclusion // USET_CASE: remove strings because the strings will actually be reduced (folded); // therefore, start with no strings and add only those needed - if (attribute & USET_CASE_INSENSITIVE) { + if ((attribute & USET_CASE_INSENSITIVE) && foldSet.hasStrings()) { foldSet.strings->removeAllElements(); } int32_t n = getRangeCount(); UChar32 result; const UChar *full; for (int32_t i=0; isize() > 0) { + if (hasStrings()) { if (attribute & USET_CASE_INSENSITIVE) { for (int32_t j=0; jsize(); ++j) { str = *(const UnicodeString *) strings->elementAt(j); str.foldCase(); if(!ucase_addStringCaseClosure(str.getBuffer(), str.length(), &sa)) { foldSet.add(str); // does not map to code points: add the folded string itself } } diff --git a/intl/icu/source/common/uniset_props.cpp b/intl/icu/source/common/uniset_props.cpp --- a/intl/icu/source/common/uniset_props.cpp +++ b/intl/icu/source/common/uniset_props.cpp @@ -42,20 +42,16 @@ #include "cstring.h" #include "mutex.h" #include "umutex.h" #include "uassert.h" #include "hash.h" U_NAMESPACE_USE -// initial storage. Must be >= 0 -// *** same as in uniset.cpp ! *** -#define START_EXTRA 16 - // Define UChar constants using hex for EBCDIC compatibility // Used #define to reduce private static exports and memory access time. #define SET_OPEN ((UChar)0x005B) /*[*/ #define SET_CLOSE ((UChar)0x005D) /*]*/ #define HYPHEN ((UChar)0x002D) /*-*/ #define COMPLEMENT ((UChar)0x005E) /*^*/ #define COLON ((UChar)0x003A) /*:*/ #define BACKSLASH ((UChar)0x005C) /*\*/ @@ -180,31 +176,18 @@ isPOSIXClose(const UnicodeString &patter /** * Constructs a set from the given pattern, optionally ignoring * white space. See the class description for the syntax of the * pattern language. * @param pattern a string specifying what characters are in the set */ UnicodeSet::UnicodeSet(const UnicodeString& pattern, - UErrorCode& status) : - len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0), - bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), - fFlags(0) -{ - if(U_SUCCESS(status)){ - list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); - /* test for NULL */ - if(list == NULL) { - status = U_MEMORY_ALLOCATION_ERROR; - }else{ - allocateStrings(status); - applyPattern(pattern, status); - } - } + UErrorCode& status) { + applyPattern(pattern, status); _dbgct(this); } //---------------------------------------------------------------- // Public API //---------------------------------------------------------------- UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern, @@ -708,24 +691,39 @@ void UnicodeSet::applyPattern(RuleCharac //---------------------------------------------------------------- namespace { static UBool numericValueFilter(UChar32 ch, void* context) { return u_getNumericValue(ch) == *(double*)context; } +static UBool generalCategoryMaskFilter(UChar32 ch, void* context) { + int32_t value = *(int32_t*)context; + return (U_GET_GC_MASK((UChar32) ch) & value) != 0; +} + static UBool versionFilter(UChar32 ch, void* context) { static const UVersionInfo none = { 0, 0, 0, 0 }; UVersionInfo v; u_charAge(ch, v); UVersionInfo* version = (UVersionInfo*)context; return uprv_memcmp(&v, &none, sizeof(v)) > 0 && uprv_memcmp(&v, version, sizeof(v)) <= 0; } +typedef struct { + UProperty prop; + int32_t value; +} IntPropertyContext; + +static UBool intPropertyFilter(UChar32 ch, void* context) { + IntPropertyContext* c = (IntPropertyContext*)context; + return u_getIntPropertyValue((UChar32) ch, c->prop) == c->value; +} + static UBool scriptExtensionsFilter(UChar32 ch, void* context) { return uscript_hasScript(ch, *(UScriptCode*)context); } } // namespace /** * Generic filter-based scanning code for UCD property UnicodeSets. @@ -776,53 +774,16 @@ void UnicodeSet::applyFilter(UnicodeSet: if (isBogus() && U_SUCCESS(status)) { // We likely ran out of memory. AHHH! status = U_MEMORY_ALLOCATION_ERROR; } } namespace { -/** Maps map values to 1 if the mask contains their value'th bit, all others to 0. */ -uint32_t U_CALLCONV generalCategoryMaskFilter(const void *context, uint32_t value) { - uint32_t mask = *(const uint32_t *)context; - value = U_MASK(value) & mask; - if (value != 0) { value = 1; } - return value; -} - -/** Maps one map value to 1, all others to 0. */ -uint32_t U_CALLCONV intValueFilter(const void *context, uint32_t value) { - uint32_t v = *(const uint32_t *)context; - return value == v ? 1 : 0; -} - -} // namespace - -void UnicodeSet::applyIntPropertyValue(const UCPMap *map, - UCPMapValueFilter *filter, const void *context, - UErrorCode &errorCode) { - if (U_FAILURE(errorCode)) { return; } - clear(); - UChar32 start = 0, end; - uint32_t value; - while ((end = ucpmap_getRange(map, start, UCPMAP_RANGE_NORMAL, 0, - filter, context, &value)) >= 0) { - if (value != 0) { - add(start, end); - } - start = end + 1; - } - if (isBogus()) { - errorCode = U_MEMORY_ALLOCATION_ERROR; - } -} - -namespace { - static UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) { /* Note: we use ' ' in compiler code page */ int32_t j = 0; char ch; --dstCapacity; /* make room for term. zero */ while ((ch = *src++) != 0) { if (ch == ' ' && (j==0 || (j>0 && dst[j-1]==' '))) { continue; @@ -840,45 +801,41 @@ static UBool mungeCharName(char* dst, co //---------------------------------------------------------------- // Property set API //---------------------------------------------------------------- #define FAIL(ec) {ec=U_ILLEGAL_ARGUMENT_ERROR; return *this;} UnicodeSet& UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec) { - if (U_FAILURE(ec)) { return *this; } - // All of the following check isFrozen() before modifying this set. + if (U_FAILURE(ec) || isFrozen()) { return *this; } if (prop == UCHAR_GENERAL_CATEGORY_MASK) { - const UCPMap *map = u_getIntPropertyMap(UCHAR_GENERAL_CATEGORY, &ec); - applyIntPropertyValue(map, generalCategoryMaskFilter, &value, ec); + const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec); + applyFilter(generalCategoryMaskFilter, &value, inclusions, ec); } else if (prop == UCHAR_SCRIPT_EXTENSIONS) { const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec); UScriptCode script = (UScriptCode)value; applyFilter(scriptExtensionsFilter, &script, inclusions, ec); } else if (0 <= prop && prop < UCHAR_BINARY_LIMIT) { if (value == 0 || value == 1) { const USet *set = u_getBinaryPropertySet(prop, &ec); if (U_FAILURE(ec)) { return *this; } copyFrom(*UnicodeSet::fromUSet(set), TRUE); if (value == 0) { complement(); } } else { clear(); } } else if (UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT) { - const UCPMap *map = u_getIntPropertyMap(prop, &ec); - applyIntPropertyValue(map, intValueFilter, &value, ec); + const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec); + IntPropertyContext c = {prop, value}; + applyFilter(intPropertyFilter, &c, inclusions, ec); } else { - // This code used to always call getInclusions(property source) - // which sets an error for an unsupported property. ec = U_ILLEGAL_ARGUMENT_ERROR; - // Otherwise we would just clear() this set because - // getIntPropertyValue(c, prop) returns 0 for all code points. } return *this; } UnicodeSet& UnicodeSet::applyPropertyAlias(const UnicodeString& prop, const UnicodeString& value, UErrorCode& ec) { diff --git a/intl/icu/source/common/uprops.h b/intl/icu/source/common/uprops.h --- a/intl/icu/source/common/uprops.h +++ b/intl/icu/source/common/uprops.h @@ -457,17 +457,16 @@ uchar_swapNames(const UDataSwapper *ds, U_NAMESPACE_BEGIN class UnicodeSet; class CharacterProperties { public: CharacterProperties() = delete; - static void U_CALLCONV initInclusion(UPropertySource src, UErrorCode &errorCode); static const UnicodeSet *getInclusionsForProperty(UProperty prop, UErrorCode &errorCode); }; // implemented in uniset_props.cpp U_CFUNC UnicodeSet * uniset_getUnicode32Instance(UErrorCode &errorCode); U_NAMESPACE_END diff --git a/intl/icu/source/common/uset.cpp b/intl/icu/source/common/uset.cpp --- a/intl/icu/source/common/uset.cpp +++ b/intl/icu/source/common/uset.cpp @@ -244,17 +244,17 @@ U_NAMESPACE_BEGIN * This class only exists to provide access to the UnicodeSet private * USet support API. Declaring a class a friend is more portable than * trying to declare extern "C" functions as friends. */ class USetAccess /* not : public UObject because all methods are static */ { public: /* Try to have the compiler inline these*/ inline static int32_t getStringCount(const UnicodeSet& set) { - return set.getStringCount(); + return set.stringsSize(); } inline static const UnicodeString* getString(const UnicodeSet& set, int32_t i) { return set.getString(i); } private: /* do not instantiate*/ USetAccess(); diff --git a/intl/icu/source/common/usetiter.cpp b/intl/icu/source/common/usetiter.cpp --- a/intl/icu/source/common/usetiter.cpp +++ b/intl/icu/source/common/usetiter.cpp @@ -111,17 +111,17 @@ void UnicodeSetIterator::reset(const Uni */ void UnicodeSetIterator::reset() { if (set == NULL) { // Set up indices to empty iteration endRange = -1; stringCount = 0; } else { endRange = set->getRangeCount() - 1; - stringCount = set->strings->size(); + stringCount = set->stringsSize(); } range = 0; endElement = -1; nextElement = 0; if (endRange >= 0) { loadRange(range); } nextString = 0; diff --git a/intl/icu/source/common/wintz.cpp b/intl/icu/source/common/wintz.cpp --- a/intl/icu/source/common/wintz.cpp +++ b/intl/icu/source/common/wintz.cpp @@ -30,26 +30,26 @@ # define NOUSER # define NOSERVICE # define NOIME # define NOMCX #include U_NAMESPACE_BEGIN -// The value of MAX_TIMEZONE_ID_LENGTH is 128, which is defined in DYNAMIC_TIME_ZONE_INFORMATION +// The max size of TimeZoneKeyName is 128, defined in DYNAMIC_TIME_ZONE_INFORMATION #define MAX_TIMEZONE_ID_LENGTH 128 /** * Main Windows time zone detection function. * Returns the Windows time zone converted to an ICU time zone as a heap-allocated buffer, or nullptr upon failure. * Note: We use the Win32 API GetDynamicTimeZoneInformation to get the current time zone info. * This API returns a non-localized time zone name, which we can then map to an ICU time zone name. */ -U_CFUNC const char* U_EXPORT2 +U_INTERNAL const char* U_EXPORT2 uprv_detectWindowsTimeZone() { UErrorCode status = U_ZERO_ERROR; char* icuid = nullptr; char dynamicTZKeyName[MAX_TIMEZONE_ID_LENGTH]; char tmpid[MAX_TIMEZONE_ID_LENGTH]; int32_t len; int id = GEOID_NOT_AVAILABLE; @@ -74,17 +74,17 @@ uprv_detectWindowsTimeZone() u_strToUTF8(ISOcode, UPRV_LENGTHOF(ISOcode), nullptr, reinterpret_cast(ISOcodeW), UPRV_LENGTHOF(ISOcodeW), &status); LocalUResourceBundlePointer bundle(ures_openDirect(nullptr, "windowsZones", &status)); ures_getByKey(bundle.getAlias(), "mapTimezones", bundle.getAlias(), &status); // convert from wchar_t* (UTF-16 on Windows) to char* (UTF-8). u_strToUTF8(dynamicTZKeyName, UPRV_LENGTHOF(dynamicTZKeyName), nullptr, - reinterpret_cast(dynamicTZI.TimeZoneKeyName), UPRV_LENGTHOF(dynamicTZI.TimeZoneKeyName), &status); + reinterpret_cast(dynamicTZI.TimeZoneKeyName), -1, &status); if (U_FAILURE(status)) { return nullptr; } if (dynamicTZI.TimeZoneKeyName[0] != 0) { UResourceBundle winTZ; ures_initStackObject(&winTZ); diff --git a/intl/icu/source/common/wintz.h b/intl/icu/source/common/wintz.h --- a/intl/icu/source/common/wintz.h +++ b/intl/icu/source/common/wintz.h @@ -23,14 +23,14 @@ * \brief C API: Utilities for dealing w/ Windows time zones. */ U_CDECL_BEGIN /* Forward declarations for Windows types... */ typedef struct _TIME_ZONE_INFORMATION TIME_ZONE_INFORMATION; U_CDECL_END -U_CFUNC const char* U_EXPORT2 +U_INTERNAL const char* U_EXPORT2 uprv_detectWindowsTimeZone(); #endif /* U_PLATFORM_USES_ONLY_WIN32_API */ #endif /* __WINTZ */ diff --git a/intl/tzdata/GIT-INFO b/intl/tzdata/GIT-INFO --- a/intl/tzdata/GIT-INFO +++ b/intl/tzdata/GIT-INFO @@ -1,5 +1,5 @@ -commit 6a8e28db3cbff837570f93881e6e4f7ff4d5fb25 +commit 6e82c7c389888603f0de84ffe5c60f43f11ee844 Author: Yoshito Umaoka -Date: Tue Oct 30 08:52:31 2018 -0400 +Date: Wed Nov 7 19:23:35 2018 -0500 - ICU-20245: tzdata2018g updates. Also added tzdata2018f release files missed previously. + ICU-20260 Fix CR/LF issue diff --git a/intl/update-icu.sh b/intl/update-icu.sh --- a/intl/update-icu.sh +++ b/intl/update-icu.sh @@ -82,16 +82,18 @@ find ${icu_dir}/source/data/zone \ -name '*.txt' -print | xargs sed -i '/^\s*zoneStrings{/{N; s/^\s*zoneStrings{\n\s*}// }; /^$/d' for patch in \ bug-915735 \ suppress-warnings.diff \ bug-1172609-timezone-recreateDefault.diff \ bug-1198952-workaround-make-3.82-bug.diff \ bug-1504656-relativetimeformat-plural-other-fallback.diff \ + bug-1513934-timezone-detection-win7-part1.diff \ + bug-1513934-timezone-detection-win7-part2.diff \ ; do echo "Applying local patch $patch" patch -d ${icu_dir}/../../ -p1 --no-backup-if-mismatch < ${icu_dir}/../icu-patches/$patch done topsrcdir=`dirname $0`/../ python ${topsrcdir}/js/src/tests/non262/String/make-normalize-generateddata-input.py $topsrcdir