]> git.saurik.com Git - apple/icu.git/blobdiff - icuSources/i18n/collationweights.cpp
ICU-64260.0.1.tar.gz
[apple/icu.git] / icuSources / i18n / collationweights.cpp
index 17c044f8e92d068d86066f61480b20ed7f17795e..05458962c6d9598c7342d6d08ab066d0c9569081 100644 (file)
@@ -1,12 +1,14 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
 /*  
 *******************************************************************************
 *
-*   Copyright (C) 1999-2014, International Business Machines
+*   Copyright (C) 1999-2015, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *
 *******************************************************************************
 *   file name:  collationweights.cpp
-*   encoding:   US-ASCII
+*   encoding:   UTF-8
 *   tab size:   8 (not used)
 *   indentation:4
 *
@@ -126,7 +128,7 @@ CollationWeights::initForSecondary() {
     maxBytes[1] = 0;
     minBytes[2] = 0;
     maxBytes[2] = 0;
-    minBytes[3] = Collation::MERGE_SEPARATOR_BYTE + 1;
+    minBytes[3] = Collation::LEVEL_SEPARATOR_BYTE + 1;
     maxBytes[3] = 0xff;
     minBytes[4] = 2;
     maxBytes[4] = 0xff;
@@ -142,7 +144,7 @@ CollationWeights::initForTertiary() {
     maxBytes[2] = 0;
     // We use only 6 bits per byte.
     // The other bits are used for case & quaternary weights.
-    minBytes[3] = Collation::MERGE_SEPARATOR_BYTE + 1;
+    minBytes[3] = Collation::LEVEL_SEPARATOR_BYTE + 1;
     maxBytes[3] = 0x3f;
     minBytes[4] = 2;
     maxBytes[4] = 0x3f;
@@ -296,24 +298,49 @@ CollationWeights::getWeightRanges(uint32_t lowerLimit, uint32_t upperLimit) {
         middle.count=(int32_t)((middle.end-middle.start)>>(8*(4-middleLength)))+1;
     } else {
         /* no middle range, eliminate overlaps */
-
-        /* reduce or remove the lower ranges that go beyond upperLimit */
         for(int32_t length=4; length>middleLength; --length) {
             if(lower[length].count>0 && upper[length].count>0) {
-                uint32_t start=upper[length].start;
-                uint32_t end=lower[length].end;
-
-                if(end>=start || incWeight(end, length)==start) {
-                    /* lower and upper ranges collide or are directly adjacent: merge these two and remove all shorter ranges */
-                    start=lower[length].start;
-                    end=lower[length].end=upper[length].end;
-                    /*
-                     * merging directly adjacent ranges needs to subtract the 0/1 gaps in between;
-                     * it may result in a range with count>countBytes
-                     */
+                // Note: The lowerEnd and upperStart weights are derived from
+                // lowerLimit and upperLimit (where lowerLimit<upperLimit):
+                // each limit is truncated (still less-or-equal),
+                // and then its last byte is changed to the
+                // maxByte (for lowerEnd) or minByte (for upperStart).
+                const uint32_t lowerEnd=lower[length].end;
+                const uint32_t upperStart=upper[length].start;
+                UBool merged=FALSE;
+
+                if(lowerEnd>upperStart) {
+                    // These two lower and upper ranges collide.
+                    // Since lowerLimit<upperLimit and lowerEnd and upperStart
+                    // are versions with only their last bytes modified
+                    // (and following ones removed/reset to 0),
+                    // lowerEnd>upperStart is only possible
+                    // if the leading bytes are equal
+                    // and lastByte(lowerEnd)>lastByte(upperStart).
+                    U_ASSERT(truncateWeight(lowerEnd, length-1)==
+                            truncateWeight(upperStart, length-1));
+                    // Intersect these two ranges.
+                    lower[length].end=upper[length].end;
                     lower[length].count=
-                        (int32_t)(getWeightTrail(end, length)-getWeightTrail(start, length)+1+
-                                  countBytes(length)*(getWeightByte(end, length-1)-getWeightByte(start, length-1)));
+                            (int32_t)getWeightTrail(lower[length].end, length)-
+                            (int32_t)getWeightTrail(lower[length].start, length)+1;
+                    // count might be <=0 in which case there is no room,
+                    // and the range-collecting code below will ignore this range.
+                    merged=TRUE;
+                } else if(lowerEnd==upperStart) {
+                    // Not possible, unless minByte==maxByte which is not allowed.
+                    U_ASSERT(minBytes[length]<maxBytes[length]);
+                } else /* lowerEnd<upperStart */ {
+                    if(incWeight(lowerEnd, length)==upperStart) {
+                        // Merge adjacent ranges.
+                        lower[length].end=upper[length].end;
+                        lower[length].count+=upper[length].count;  // might be >countBytes
+                        merged=TRUE;
+                    }
+                }
+                if(merged) {
+                    // Remove all shorter ranges.
+                    // There was no room available for them between the ranges we just merged.
                     upper[length].count=0;
                     while(--length>middleLength) {
                         lower[length].count=upper[length].count=0;
@@ -500,7 +527,7 @@ CollationWeights::allocWeights(uint32_t lowerLimit, uint32_t upperLimit, int32_t
 #ifdef UCOL_DEBUG
         printf("lengthen the short ranges from %ld bytes to %ld and iterate\n", minLength, minLength+1);
 #endif
-        for(int32_t i=0; ranges[i].length==minLength; ++i) {
+        for(int32_t i=0; i<rangeCount && ranges[i].length==minLength; ++i) {
             lengthenRange(ranges[i]);
         }
     }