ICU-6.2.14.tar.gz

[apple/icu.git] / icuSources / i18n / ucol_bld.cpp
diff --git a/icuSources/i18n/ucol_bld.cpp b/icuSources/i18n/ucol_bld.cpp

index f9bc084e6650265ed02d8a0108a790abc7bac15b..cfc5d6d352556eed0cd3acecd872ab83558a4366 100644 (file)
--- a/icuSources/i18n/ucol_bld.cpp
+++ b/icuSources/i18n/ucol_bld.cpp
@@ -1,7 +1,7 @@
  /*
  *******************************************************************************
  *
  /*
  *******************************************************************************
  *
-*   Copyright (C) 2001-2003, International Business Machines
+*   Copyright (C) 2001-2004, International Business Machines
  *   Corporation and others.  All Rights Reserved.
  *
  *******************************************************************************
  *   Corporation and others.  All Rights Reserved.
  *
  *******************************************************************************
@@ -28,7 +28,7 @@
  #include "umutex.h"
  #include "unicode/uniset.h"
  
  #include "umutex.h"
  #include "unicode/uniset.h"
  
-static const InverseUCATableHeader* invUCA = NULL;
+static const InverseUCATableHeader* _staticInvUCA = NULL;
  static UDataMemory* invUCA_DATA_MEM = NULL;
  
  U_CDECL_BEGIN
  static UDataMemory* invUCA_DATA_MEM = NULL;
  
  U_CDECL_BEGIN
@@ -40,15 +40,15 @@ isAcceptableInvUCA(void * /*context*/,
      if( pInfo->size>=20 &&
          pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
          pInfo->charsetFamily==U_CHARSET_FAMILY &&
      if( pInfo->size>=20 &&
          pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
          pInfo->charsetFamily==U_CHARSET_FAMILY &&
-        pInfo->dataFormat[0]==invUcaDataInfo.dataFormat[0] &&   /* dataFormat="InvC" */
-        pInfo->dataFormat[1]==invUcaDataInfo.dataFormat[1] &&
-        pInfo->dataFormat[2]==invUcaDataInfo.dataFormat[2] &&
-        pInfo->dataFormat[3]==invUcaDataInfo.dataFormat[3] &&
-        pInfo->formatVersion[0]==invUcaDataInfo.formatVersion[0] &&
-        pInfo->formatVersion[1]>=invUcaDataInfo.formatVersion[1] //&&
-        //pInfo->formatVersion[1]==invUcaDataInfo.formatVersion[1] &&
-        //pInfo->formatVersion[2]==invUcaDataInfo.formatVersion[2] &&
-        //pInfo->formatVersion[3]==invUcaDataInfo.formatVersion[3] &&
+        pInfo->dataFormat[0]==INVUCA_DATA_FORMAT_0 &&   /* dataFormat="InvC" */
+        pInfo->dataFormat[1]==INVUCA_DATA_FORMAT_1 &&
+        pInfo->dataFormat[2]==INVUCA_DATA_FORMAT_2 &&
+        pInfo->dataFormat[3]==INVUCA_DATA_FORMAT_3 &&
+        pInfo->formatVersion[0]==INVUCA_FORMAT_VERSION_0 &&
+        pInfo->formatVersion[1]>=INVUCA_FORMAT_VERSION_1 //&&
+        //pInfo->formatVersion[1]==INVUCA_FORMAT_VERSION_1 &&
+        //pInfo->formatVersion[2]==INVUCA_FORMAT_VERSION_2 &&
+        //pInfo->formatVersion[3]==INVUCA_FORMAT_VERSION_3 &&
          ) {
          UVersionInfo UCDVersion;
          u_getUnicodeVersion(UCDVersion);
          ) {
          UVersionInfo UCDVersion;
          u_getUnicodeVersion(UCDVersion);
@@ -68,11 +68,11 @@ isAcceptableInvUCA(void * /*context*/,
  U_CDECL_END
  
  static
  U_CDECL_END
  
  static
-int32_t ucol_inv_findCE(uint32_t CE, uint32_t SecondCE) {
-  uint32_t bottom = 0, top = invUCA->tableSize;
+int32_t ucol_inv_findCE(const UColTokenParser *src, uint32_t CE, uint32_t SecondCE) {
+  uint32_t bottom = 0, top = src->invUCA->tableSize;
    uint32_t i = 0;
    uint32_t first = 0, second = 0;
    uint32_t i = 0;
    uint32_t first = 0, second = 0;
-  uint32_t *CETable = (uint32_t *)((uint8_t *)invUCA+invUCA->table);
+  uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
  
    while(bottom < top-1) {
      i = (top+bottom)/2;
  
    while(bottom < top-1) {
      i = (top+bottom)/2;
@@ -116,13 +116,14 @@ static const uint32_t strengthMask[UCOL_CE_STRENGTH_LIMIT] = {
    0xFFFFFFFF
  };
  
    0xFFFFFFFF
  };
  
-U_CAPI int32_t U_EXPORT2 ucol_inv_getNextCE(uint32_t CE, uint32_t contCE, 
+U_CAPI int32_t U_EXPORT2 ucol_inv_getNextCE(const UColTokenParser *src,
+                                            uint32_t CE, uint32_t contCE, 
                                              uint32_t *nextCE, uint32_t *nextContCE, 
                                              uint32_t strength) {
                                              uint32_t *nextCE, uint32_t *nextContCE, 
                                              uint32_t strength) {
-  uint32_t *CETable = (uint32_t *)((uint8_t *)invUCA+invUCA->table);
+  uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
    int32_t iCE;
  
    int32_t iCE;
  
-  iCE = ucol_inv_findCE(CE, contCE);
+  iCE = ucol_inv_findCE(src, CE, contCE);
  
    if(iCE<0) {
      *nextCE = UCOL_NOT_FOUND;
  
    if(iCE<0) {
      *nextCE = UCOL_NOT_FOUND;
@@ -144,13 +145,14 @@ U_CAPI int32_t U_EXPORT2 ucol_inv_getNextCE(uint32_t CE, uint32_t contCE,
    return iCE;
  }
  
    return iCE;
  }
  
-U_CAPI int32_t U_EXPORT2 ucol_inv_getPrevCE(uint32_t CE, uint32_t contCE, 
+U_CAPI int32_t U_EXPORT2 ucol_inv_getPrevCE(const UColTokenParser *src, 
+                                            uint32_t CE, uint32_t contCE, 
                                              uint32_t *prevCE, uint32_t *prevContCE, 
                                              uint32_t strength) {
                                              uint32_t *prevCE, uint32_t *prevContCE, 
                                              uint32_t strength) {
-  uint32_t *CETable = (uint32_t *)((uint8_t *)invUCA+invUCA->table);
+  uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
    int32_t iCE;
  
    int32_t iCE;
  
-  iCE = ucol_inv_findCE(CE, contCE);
+  iCE = ucol_inv_findCE(src, CE, contCE);
  
    if(iCE<0) {
      *prevCE = UCOL_NOT_FOUND;
  
    if(iCE<0) {
      *prevCE = UCOL_NOT_FOUND;
@@ -174,17 +176,30 @@ U_CAPI int32_t U_EXPORT2 ucol_inv_getPrevCE(uint32_t CE, uint32_t contCE,
    return iCE;
  }
  
    return iCE;
  }
  
+/**
+ * Compares a CE / continuation-CE pair against a previous pair, level by level.
+ * Starting at UCOL_TERTIARY and stepping down towards 0, both pairs are masked
+ * with strengthMask[level]; the first level at which the CEs and the
+ * continuation CEs both agree under that mask is returned.  If no level above
+ * 0 agrees, 0 is returned.
+ *
+ * @param CE          current collation element
+ * @param contCE      continuation belonging to CE
+ * @param prevCE      collation element to compare against
+ * @param prevContCE  continuation belonging to prevCE
+ * @return the level at which the walk stopped (UCOL_TERTIARY down to 0)
+ */
+U_CAPI uint32_t U_EXPORT2 ucol_getCEStrengthDifference(uint32_t CE, uint32_t contCE, 
+                                            uint32_t prevCE, uint32_t prevContCE) {
+    uint32_t level = UCOL_TERTIARY;
+
+    for(; level != 0; level--) {
+        const uint32_t mask = strengthMask[level];
+        /* XOR leaves a bit set wherever the two values differ; masking it */
+        /* tells us whether they differ in the part relevant to this level. */
+        const UBool sameCE = (UBool)(((prevCE ^ CE) & mask) == 0);
+        const UBool sameCont = (UBool)(((prevContCE ^ contCE) & mask) == 0);
+        if(sameCE && sameCont) {
+            break;
+        }
+    }
+    return level;
+}
+
+
  static
  static
-inline int32_t ucol_inv_getPrevious(UColTokListHeader *lh, uint32_t strength) {
+inline int32_t ucol_inv_getPrevious(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) {
  
    uint32_t CE = lh->baseCE;
    uint32_t SecondCE = lh->baseContCE; 
  
  
    uint32_t CE = lh->baseCE;
    uint32_t SecondCE = lh->baseContCE; 
  
-  uint32_t *CETable = (uint32_t *)((uint8_t *)invUCA+invUCA->table);
+  uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
    uint32_t previousCE, previousContCE;
    int32_t iCE;
  
    uint32_t previousCE, previousContCE;
    int32_t iCE;
  
-  iCE = ucol_inv_findCE(CE, SecondCE);
+  iCE = ucol_inv_findCE(src, CE, SecondCE);
  
    if(iCE<0) {
      return -1;
  
    if(iCE<0) {
      return -1;
@@ -207,15 +222,15 @@ inline int32_t ucol_inv_getPrevious(UColTokListHeader *lh, uint32_t strength) {
  }
  
  static
  }
  
  static
-inline int32_t ucol_inv_getNext(UColTokListHeader *lh, uint32_t strength) {
+inline int32_t ucol_inv_getNext(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) {
    uint32_t CE = lh->baseCE;
    uint32_t SecondCE = lh->baseContCE; 
  
    uint32_t CE = lh->baseCE;
    uint32_t SecondCE = lh->baseContCE; 
  
-  uint32_t *CETable = (uint32_t *)((uint8_t *)invUCA+invUCA->table);
+  uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
    uint32_t nextCE, nextContCE;
    int32_t iCE;
  
    uint32_t nextCE, nextContCE;
    int32_t iCE;
  
-  iCE = ucol_inv_findCE(CE, SecondCE);
+  iCE = ucol_inv_findCE(src, CE, SecondCE);
  
    if(iCE<0) {
      return -1;
  
    if(iCE<0) {
      return -1;
@@ -242,7 +257,7 @@ inline int32_t ucol_inv_getNext(UColTokListHeader *lh, uint32_t strength) {
  U_CFUNC void ucol_inv_getGapPositions(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) {
    /* reset all the gaps */
    int32_t i = 0;
  U_CFUNC void ucol_inv_getGapPositions(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) {
    /* reset all the gaps */
    int32_t i = 0;
-  uint32_t *CETable = (uint32_t *)((uint8_t *)invUCA+invUCA->table);
+  uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
    uint32_t st = 0;
    uint32_t t1, t2;
    int32_t pos;
    uint32_t st = 0;
    uint32_t t1, t2;
    int32_t pos;
@@ -265,7 +280,7 @@ U_CFUNC void ucol_inv_getGapPositions(UColTokenParser *src, UColTokListHeader *l
  
    UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
  
  
    UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
  
-  if(lh->baseCE >= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && lh->baseCE < (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */ 
+  if((lh->baseCE & 0xFF000000)>= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (lh->baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */ 
    //if(lh->baseCE >= PRIMARY_IMPLICIT_MIN && lh->baseCE < PRIMARY_IMPLICIT_MAX ) { /* implicits - */ 
      lh->pos[0] = 0;
      t1 = lh->baseCE;
    //if(lh->baseCE >= PRIMARY_IMPLICIT_MIN && lh->baseCE < PRIMARY_IMPLICIT_MAX ) { /* implicits - */ 
      lh->pos[0] = 0;
      t1 = lh->baseCE;
@@ -273,16 +288,12 @@ U_CFUNC void ucol_inv_getGapPositions(UColTokenParser *src, UColTokListHeader *l
      lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
      lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
      lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
      lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
      lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
      lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
-    if(lh->baseCE < 0xEF000000) {
-    /* first implicits have three byte primaries, with a gap of one */
-    /* so we esentially need to add 2 to the top byte in lh->baseContCE */
-      t2 += 0x02000000;
-    } else {
-    /* second implicits have four byte primaries, with a gap of IMPLICIT_LAST2_MULTIPLIER_ */
-    /* Now, this guy is not really accessible here, so until we find a better way to pass it */
-    /* around, we'll assume that the gap is 1 */
-      t2 += 0x00020000;
-    }
+    uint32_t primaryCE = t1 & UCOL_PRIMARYMASK | (t2 & UCOL_PRIMARYMASK) >> 16;
+    primaryCE = uprv_uca_getImplicitFromRaw(uprv_uca_getRawFromImplicit(primaryCE)+1);
+
+    t1 = primaryCE & UCOL_PRIMARYMASK | 0x0505;
+    t2 = (primaryCE << 16) & UCOL_PRIMARYMASK | UCOL_CONTINUATION_MARKER;
+
      lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
      lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
      lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
      lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
      lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
      lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
@@ -302,7 +313,7 @@ U_CFUNC void ucol_inv_getGapPositions(UColTokenParser *src, UColTokListHeader *l
    } else {
      for(;;) {
        if(tokStrength < UCOL_CE_STRENGTH_LIMIT) {
    } else {
      for(;;) {
        if(tokStrength < UCOL_CE_STRENGTH_LIMIT) {
-        if((lh->pos[tokStrength] = ucol_inv_getNext(lh, tokStrength)) >= 0) {
+        if((lh->pos[tokStrength] = ucol_inv_getNext(src, lh, tokStrength)) >= 0) {
            lh->fStrToken[tokStrength] = tok;
          } else { /* The CE must be implicit, since it's not in the table */
            /* Error */
            lh->fStrToken[tokStrength] = tok;
          } else { /* The CE must be implicit, since it's not in the table */
            /* Error */
@@ -339,9 +350,11 @@ U_CFUNC void ucol_inv_getGapPositions(UColTokenParser *src, UColTokListHeader *l
          lh->gapsHi[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
          //lh->gapsHi[3*st+2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
          lh->gapsHi[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16;
          lh->gapsHi[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
          //lh->gapsHi[3*st+2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
          lh->gapsHi[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16;
-        pos--;
-        t1 = *(CETable+3*(pos));
-        t2 = *(CETable+3*(pos)+1);
+        //pos--;
+        //t1 = *(CETable+3*(pos));
+        //t2 = *(CETable+3*(pos)+1);
+        t1 = lh->baseCE;
+        t2 = lh->baseContCE;
          lh->gapsLo[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
          lh->gapsLo[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
          lh->gapsLo[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16;
          lh->gapsLo[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
          lh->gapsLo[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
          lh->gapsLo[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16;
@@ -402,7 +415,14 @@ U_CFUNC uint32_t ucol_getCEGenerator(ucolCEGenerator *g, uint32_t* lows, uint32_
    uint32_t strength = tok->strength;
    uint32_t low = lows[fStrength*3+strength];
    uint32_t high = highs[fStrength*3+strength];
    uint32_t strength = tok->strength;
    uint32_t low = lows[fStrength*3+strength];
    uint32_t high = highs[fStrength*3+strength];
-  uint32_t maxByte = (strength == UCOL_TERTIARY)?0x3F:0xFF;
+  uint32_t maxByte = 0;
+  if(strength == UCOL_TERTIARY) {
+      maxByte = 0x3F;
+  } else if(strength == UCOL_PRIMARY) {
+      maxByte = 0xFE;
+  } else {
+      maxByte = 0xFF;
+  }
  
    uint32_t count = tok->toInsert;
  
  
    uint32_t count = tok->toInsert;
  
@@ -439,9 +459,10 @@ U_CFUNC uint32_t ucol_getCEGenerator(ucolCEGenerator *g, uint32_t* lows, uint32_
      if(high > (UCOL_COMMON_BOT2<<24) && high < (uint32_t)(UCOL_COMMON_TOP2<<24)) {
        high = UCOL_COMMON_TOP2<<24;
      } 
      if(high > (UCOL_COMMON_BOT2<<24) && high < (uint32_t)(UCOL_COMMON_TOP2<<24)) {
        high = UCOL_COMMON_TOP2<<24;
      } 
-    if(low < UCOL_COMMON_BOT2<<24) {
-      g->noOfRanges = ucol_allocWeights(UCOL_COMMON_TOP2<<24, high, count, maxByte, g->ranges);
-      g->current = UCOL_COMMON_BOT2;
+    if(low < (UCOL_COMMON_BOT2<<24)) {
+      g->noOfRanges = ucol_allocWeights(UCOL_BYTE_UNSHIFTED_MIN<<24, high, count, maxByte, g->ranges);
+      g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
+      //g->current = UCOL_COMMON_BOT2<<24;
        return g->current;
      }
    } 
        return g->current;
      }
    } 
@@ -454,7 +475,127 @@ U_CFUNC uint32_t ucol_getCEGenerator(ucolCEGenerator *g, uint32_t* lows, uint32_
    return g->current;
  }
  
    return g->current;
  }
  
-U_CFUNC void ucol_doCE(uint32_t *CEparts, UColToken *tok) {
+/**
+ * Copies source into resBuf, replacing every small kana letter handled by the
+ * switch below with its full-size counterpart.  All other code units are
+ * copied unchanged.
+ *
+ * @param source    input UTF-16 code units
+ * @param sourceLen number of code units in source
+ * @param resBuf    output buffer
+ * @param resLen    capacity of resBuf, in code units
+ * @param status    in/out error code; nothing is done if it already signals a
+ *                  failure, and it is set to U_MEMORY_ALLOCATION_ERROR when
+ *                  sourceLen exceeds resLen
+ * @return sourceLen on success, 0 on failure
+ */
+static
+uint32_t u_toLargeKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) {
+  uint32_t i = 0; 
+  UChar c;
+
+  if(U_FAILURE(*status)) {
+    return 0;
+  }
+
+  if(sourceLen > resLen) {
+    *status = U_MEMORY_ALLOCATION_ERROR;
+    return 0;
+  }
+  
+  for(i = 0; i < sourceLen; i++) {
+    c = source[i];
+    /* The guard has to span every case label of the switch, i.e. U+3041   */
+    /* (hiragana small A) through U+30F6 (katakana small KE), inclusive.   */
+    /* The former exclusive bounds (0x3042 < c && c < 0x30ef) made the     */
+    /* 0x41, 0xF5 and 0xF6 cases unreachable.                              */
+    if(0x3041 <= c && c <= 0x30F6) { /* Kana range */
+      switch(c - 0x3000) {
+      /* small letters whose full-size form is the next code point */
+      /* NOTE(review): small YO (U+3087 / U+30E7) is not listed - confirm whether that is intended. */
+      case 0x41: case 0x43: case 0x45: case 0x47: case 0x49: case 0x63: case 0x83: case 0x85: case 0x8E:
+      case 0xA1: case 0xA3: case 0xA5: case 0xA7: case 0xA9: case 0xC3: case 0xE3: case 0xE5: case 0xEE:
+        c++;
+        break;
+      case 0xF5: /* katakana small KA -> KA */
+        c = 0x30AB;
+        break;
+      case 0xF6: /* katakana small KE -> KE */
+        c = 0x30B1;
+        break;
+      default:
+        break;
+      }
+    }
+    resBuf[i] = c;
+  }
+  return sourceLen;
+}
+
+/**
+ * Copies source into resBuf, replacing every full-size kana letter handled by
+ * the switch below with its small counterpart.  All other code units are
+ * copied unchanged.
+ *
+ * @param source    input UTF-16 code units
+ * @param sourceLen number of code units in source
+ * @param resBuf    output buffer
+ * @param resLen    capacity of resBuf, in code units
+ * @param status    in/out error code; nothing is done if it already signals a
+ *                  failure, and it is set to U_MEMORY_ALLOCATION_ERROR when
+ *                  sourceLen exceeds resLen
+ * @return sourceLen on success, 0 on failure
+ */
+static
+uint32_t u_toSmallKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) {
+  uint32_t i = 0; 
+  UChar c;
+
+  if(U_FAILURE(*status)) {
+    return 0;
+  }
+
+  if(sourceLen > resLen) {
+    *status = U_MEMORY_ALLOCATION_ERROR;
+    return 0;
+  }
+  
+  for(i = 0; i < sourceLen; i++) {
+    c = source[i];
+    /* The guard has to span every case label of the switch, i.e. U+3042   */
+    /* (hiragana A) through U+30EF (katakana WA), inclusive.  The former   */
+    /* exclusive bounds (0x3042 < c && c < 0x30ef) made the 0x42 and 0xEF  */
+    /* cases unreachable.                                                  */
+    if(0x3042 <= c && c <= 0x30EF) { /* Kana range */
+      switch(c - 0x3000) {
+      /* full-size letters whose small form is the preceding code point */
+      /* NOTE(review): YO (U+3088 / U+30E8) is not listed - confirm whether that is intended. */
+      case 0x42: case 0x44: case 0x46: case 0x48: case 0x4A: case 0x64: case 0x84: case 0x86: case 0x8F:
+      case 0xA2: case 0xA4: case 0xA6: case 0xA8: case 0xAA: case 0xC4: case 0xE4: case 0xE6: case 0xEF:
+        c--;
+        break;
+      case 0xAB: /* katakana KA -> small KA */
+        c = 0x30F5;
+        break;
+      case 0xB1: /* katakana KE -> small KE */
+        c = 0x30F6;
+        break;
+      default:
+        break;
+      }
+    }
+    resBuf[i] = c;
+  }
+  return sourceLen;
+}
+
+/**
+ * Works out the case bits for a sequence of code units.
+ *
+ * The input is normalized to NFKD into a 128-unit stack buffer.  For each
+ * normalized code unit the next CE is fetched from UCA:
+ *  - if the CE's case bits equal UCOL_UPPER_CASE the unit counts as upper case;
+ *  - otherwise it counts as lower case when u_islower() says so, or when it is
+ *    a small kana (u_toSmallKana leaves it alone while u_toLargeKana changes it).
+ *
+ * @param UCA    collator used to look up the CEs
+ * @param src    code units to examine
+ * @param len    number of code units in src
+ * @param status in/out error code; set to U_INTERNAL_PROGRAM_ERROR if a
+ *               continuation CE is returned for a code unit
+ * @return UCOL_MIXED_CASE if both upper- and lower-case units were seen,
+ *         UCOL_UPPER_CASE if only upper-case units were seen, otherwise
+ *         UCOL_LOWER_CASE (also returned on any failure)
+ */
+static
+uint8_t ucol_uprv_getCaseBits(const UCollator *UCA, const UChar *src, uint32_t len, UErrorCode *status) {
+  UChar nfkd[128];
+  uint32_t nfkdLen = 0;
+  uint32_t idx = 0;
+  UBool sawUpper = FALSE, sawLower = FALSE;
+
+  collIterate iter;
+  uint32_t ce = 0;
+
+  if(U_FAILURE(*status)) {
+    return UCOL_LOWER_CASE;
+  }
+
+  nfkdLen = unorm_normalize(src, len, UNORM_NFKD, 0, nfkd, 128, status);
+  if(U_SUCCESS(*status)) {
+    for(idx = 0; idx < nfkdLen; idx++) {
+      /* look at one code unit at a time */
+      uprv_init_collIterate(UCA, &nfkd[idx], 1, &iter);
+      ce = ucol_getNextCE(UCA, &iter, status);
+      if(isContinuation(ce)) {
+        *status = U_INTERNAL_PROGRAM_ERROR;
+        return UCOL_LOWER_CASE;
+      }
+
+      if((ce&UCOL_CASE_BIT_MASK)== UCOL_UPPER_CASE) {
+        sawUpper = TRUE;
+        continue;
+      }
+      if(u_islower(nfkd[idx])) {
+        sawLower = TRUE;
+        continue;
+      }
+
+      /* Neither upper nor lower: treat small kana as lower case. */
+      /* NOTE(review): if status already signals failure here the helpers */
+      /* return without writing their output buffers - confirm callers    */
+      /* never reach this point with a failed status.                     */
+      UChar smallForm[1], largeForm[1];
+      u_toSmallKana(&nfkd[idx], 1, smallForm, 1, status);
+      u_toLargeKana(&nfkd[idx], 1, largeForm, 1, status);
+      if(smallForm[0] == nfkd[idx] && largeForm[0] != nfkd[idx]) {
+        sawLower = TRUE;
+      }
+    }
+  }
+
+  if(sawUpper) {
+    return sawLower ? UCOL_MIXED_CASE : UCOL_UPPER_CASE;
+  }
+  return UCOL_LOWER_CASE;
+}
+
+
+U_CFUNC void ucol_doCE(UColTokenParser *src, uint32_t *CEparts, UColToken *tok, UErrorCode *status) {
    /* this one makes the table and stuff */
    uint32_t noOfBytes[3];
    uint32_t i;
    /* this one makes the table and stuff */
    uint32_t noOfBytes[3];
    uint32_t i;
@@ -494,6 +635,22 @@ U_CFUNC void ucol_doCE(uint32_t *CEparts, UColToken *tok) {
      tok->noOfCEs = CEi;
    }
  
      tok->noOfCEs = CEi;
    }
  
+
+  // we want to set case bits here and now, not later.
+  // Case bits handling 
+  tok->CEs[0] &= 0xFFFFFF3F; // Clean the case bits field
+  int32_t cSize = (tok->source & 0xFF000000) >> 24;
+  UChar *cPoints = (tok->source & 0x00FFFFFF) + src->source;
+
+  if(cSize > 1) {
+    // Do it manually
+    tok->CEs[0] |= ucol_uprv_getCaseBits(src->UCA, cPoints, cSize, status);
+  } else {
+    // Copy it from the UCA
+    uint32_t caseCE = ucol_getFirstCE(src->UCA, cPoints[0], status);
+    tok->CEs[0] |= (caseCE & 0xC0);
+  }
+
  #if UCOL_DEBUG==2
    fprintf(stderr, "%04X str: %i, [%08X, %08X, %08X]: tok: ", tok->debugSource, tok->strength, CEparts[0] >> (32-8*noOfBytes[0]), CEparts[1] >> (32-8*noOfBytes[1]), CEparts[2]>> (32-8*noOfBytes[2]));
    for(i = 0; i<tok->noOfCEs; i++) {
  #if UCOL_DEBUG==2
    fprintf(stderr, "%04X str: %i, [%08X, %08X, %08X]: tok: ", tok->debugSource, tok->strength, CEparts[0] >> (32-8*noOfBytes[0]), CEparts[1] >> (32-8*noOfBytes[1]), CEparts[2]>> (32-8*noOfBytes[2]));
    for(i = 0; i<tok->noOfCEs; i++) {
@@ -604,130 +761,11 @@ U_CFUNC void ucol_initBuffers(UColTokenParser *src, UColTokListHeader *lh, UErro
          CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
        }
      }
          CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
        }
      }
-    ucol_doCE(CEparts, tok);
+    ucol_doCE(src, CEparts, tok, status);
      tok = tok->next;
    }
  }
  
      tok = tok->next;
    }
  }
  
-static
-uint32_t u_toLargeKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) {
-  uint32_t i = 0; 
-  UChar c;
-
-  if(U_FAILURE(*status)) {
-    return 0;
-  }
-
-  if(sourceLen > resLen) {
-    *status = U_MEMORY_ALLOCATION_ERROR;
-    return 0;
-  }
-  
-  for(i = 0; i < sourceLen; i++) {
-    c = source[i];
-    if(0x3042 < c && c < 0x30ef) { /* Kana range */
-      switch(c - 0x3000) {
-      case 0x41: case 0x43: case 0x45: case 0x47: case 0x49: case 0x63: case 0x83: case 0x85: case 0x8E:
-      case 0xA1: case 0xA3: case 0xA5: case 0xA7: case 0xA9: case 0xC3: case 0xE3: case 0xE5: case 0xEE:
-        c++;
-        break;
-      case 0xF5:
-        c = 0x30AB;
-        break;
-      case 0xF6:
-        c = 0x30B1;
-        break;
-      }
-    }
-    resBuf[i] = c;
-  }
-  return sourceLen;
-}
-
-static
-uint32_t u_toSmallKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) {
-  uint32_t i = 0; 
-  UChar c;
-
-  if(U_FAILURE(*status)) {
-    return 0;
-  }
-
-  if(sourceLen > resLen) {
-    *status = U_MEMORY_ALLOCATION_ERROR;
-    return 0;
-  }
-  
-  for(i = 0; i < sourceLen; i++) {
-    c = source[i];
-    if(0x3042 < c && c < 0x30ef) { /* Kana range */
-      switch(c - 0x3000) {
-      case 0x42: case 0x44: case 0x46: case 0x48: case 0x4A: case 0x64: case 0x84: case 0x86: case 0x8F:
-      case 0xA2: case 0xA4: case 0xA6: case 0xA8: case 0xAA: case 0xC4: case 0xE4: case 0xE6: case 0xEF:
-        c--;
-        break;
-      case 0xAB:
-        c = 0x30F5;
-        break;
-      case 0xB1:
-        c = 0x30F6;
-        break;
-      }
-    }
-    resBuf[i] = c;
-  }
-  return sourceLen;
-}
-
-static
-uint8_t ucol_uprv_getCaseBits(const UCollator *UCA, const UChar *src, uint32_t len, UErrorCode *status) {
-  uint32_t i = 0;
-  UChar n[128];
-  uint32_t nLen = 0;
-  uint32_t uCount = 0, lCount = 0;
-
-  collIterate s;
-  uint32_t order = 0;
-
-  if(U_FAILURE(*status)) {
-    return UCOL_LOWER_CASE;
-  }
-
-  nLen = unorm_normalize(src, len, UNORM_NFKD, 0, n, 128, status);
-  if(U_SUCCESS(*status)) {
-    for(i = 0; i < nLen; i++) {
-      uprv_init_collIterate(UCA, &n[i], 1, &s);
-      order = ucol_getNextCE(UCA, &s, status);
-      if(isContinuation(order)) {
-        *status = U_INTERNAL_PROGRAM_ERROR;
-        return UCOL_LOWER_CASE;
-      }
-      if((order&UCOL_CASE_BIT_MASK)== UCOL_UPPER_CASE) {
-        uCount++;
-      } else {
-        if(u_islower(n[i])) {
-          lCount++;
-        } else {
-          UChar sk[1], lk[1];
-          u_toSmallKana(&n[i], 1, sk, 1, status);
-          u_toLargeKana(&n[i], 1, lk, 1, status);
-          if(sk[0] == n[i] && lk[0] != n[i]) {
-            lCount++;
-          }
-        }
-      }
-    }
-  }
-
-  if(uCount != 0 && lCount != 0) {
-    return UCOL_MIXED_CASE;
-  } else if(uCount != 0) {
-    return UCOL_UPPER_CASE;
-  } else {
-    return UCOL_LOWER_CASE;
-  }
-}
-
  U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokListHeader *lh, UErrorCode *status) {
    UCAElements el;
    UColToken *tok = lh->first;
  U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokListHeader *lh, UErrorCode *status) {
    UCAElements el;
    UColToken *tok = lh->first;
@@ -834,6 +872,8 @@ U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokL
        }
      }
  
        }
      }
  
+#if 0
+    // we do case bits in doCE now, since we will mess up expansions otherwise.
      // Case bits handling 
      el.CEs[0] &= 0xFFFFFF3F; // Clean the case bits field
      if(el.cSize > 1) {
      // Case bits handling 
      el.CEs[0] &= 0xFFFFFF3F; // Clean the case bits field
      if(el.cSize > 1) {
@@ -844,6 +884,7 @@ U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokL
        uint32_t caseCE = ucol_getFirstCE(src->UCA, el.cPoints[0], status);
        el.CEs[0] |= (caseCE & 0xC0);
      }
        uint32_t caseCE = ucol_getFirstCE(src->UCA, el.cPoints[0], status);
        el.CEs[0] |= (caseCE & 0xC0);
      }
+#endif
  
      /* and then, add it */
  #if UCOL_DEBUG==2
  
      /* and then, add it */
  #if UCOL_DEBUG==2
@@ -1036,7 +1077,12 @@ UCATableHeader *ucol_assembleTailoringTable(UColTokenParser *src, UErrorCode *st
      /* We stuff the initial value in the buffers, and increase the appropriate buffer */
      /* According to strength                                                          */
      if(U_SUCCESS(*status)) {
      /* We stuff the initial value in the buffers, and increase the appropriate buffer */
      /* According to strength                                                          */
      if(U_SUCCESS(*status)) {
-      ucol_initBuffers(src, &src->lh[i], status);
+      if(src->lh[i].first) { // if there are any elements
+        // due to the way parser works, subsequent tailorings
+        // may remove all the elements from a sequence, therefore
+        // leaving an empty tailoring sequence.
+        ucol_initBuffers(src, &src->lh[i], status);
+      }
      }
      if(U_FAILURE(*status)) {
        return NULL;
      }
      if(U_FAILURE(*status)) {
        return NULL;
@@ -1062,7 +1108,7 @@ UCATableHeader *ucol_assembleTailoringTable(UColTokenParser *src, UErrorCode *st
    }
  
  
    }
  
  
-  tempUCATable *t = uprv_uca_initTempTable(image, src->opts, src->UCA, NOT_FOUND_TAG, status);
+  tempUCATable *t = uprv_uca_initTempTable(image, src->opts, src->UCA, NOT_FOUND_TAG, NOT_FOUND_TAG, status);
  
  
    /* After this, we have assigned CE values to all regular CEs      */
  
  
    /* After this, we have assigned CE values to all regular CEs      */
@@ -1129,7 +1175,7 @@ UCATableHeader *ucol_assembleTailoringTable(UColTokenParser *src, UErrorCode *st
              el.cSize = 2;
            }
            ucol_setText(ucaEl, el.uchars, el.cSize, status);
              el.cSize = 2;
            }
            ucol_setText(ucaEl, el.uchars, el.cSize, status);
-          while ((el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) {
+          while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) {
              el.noOfCEs++;
            }
            uprv_uca_addAnElement(t, &el, status);
              el.noOfCEs++;
            }
            uprv_uca_addAnElement(t, &el, status);
@@ -1161,14 +1207,16 @@ UCATableHeader *ucol_assembleTailoringTable(UColTokenParser *src, UErrorCode *st
    return myData;
  }
  
    return myData;
  }
  
-UBool
+U_CDECL_BEGIN
+static UBool U_CALLCONV
  ucol_bld_cleanup(void)
  {
      udata_close(invUCA_DATA_MEM);
      invUCA_DATA_MEM = NULL;
  ucol_bld_cleanup(void)
  {
      udata_close(invUCA_DATA_MEM);
      invUCA_DATA_MEM = NULL;
-    invUCA = NULL;
+    _staticInvUCA = NULL;
      return TRUE;
  }
      return TRUE;
  }
+U_CDECL_END
  
  U_CAPI const InverseUCATableHeader * U_EXPORT2
  ucol_initInverseUCA(UErrorCode *status)
  
  U_CAPI const InverseUCATableHeader * U_EXPORT2
  ucol_initInverseUCA(UErrorCode *status)
@@ -1176,7 +1224,7 @@ ucol_initInverseUCA(UErrorCode *status)
      if(U_FAILURE(*status)) return NULL;
  
      umtx_lock(NULL);
      if(U_FAILURE(*status)) return NULL;
  
      umtx_lock(NULL);
-    UBool f = (invUCA == NULL);
+    UBool f = (_staticInvUCA == NULL);
      umtx_unlock(NULL);
      
      if(f) {
      umtx_unlock(NULL);
      
      if(f) {
@@ -1203,8 +1251,8 @@ ucol_initInverseUCA(UErrorCode *status)
              }
              
              umtx_lock(NULL);
              }
              
              umtx_lock(NULL);
-            if(invUCA == NULL) {
-                invUCA = newInvUCA;
+            if(_staticInvUCA == NULL) {
+                _staticInvUCA = newInvUCA;
                  invUCA_DATA_MEM = result;
                  result = NULL;
                  newInvUCA = NULL;
                  invUCA_DATA_MEM = result;
                  result = NULL;
                  newInvUCA = NULL;
@@ -1218,11 +1266,11 @@ ucol_initInverseUCA(UErrorCode *status)
                  //uprv_free(newInvUCA);
              }
              else {
                  //uprv_free(newInvUCA);
              }
              else {
-                ucln_i18n_registerCleanup();
+                ucln_i18n_registerCleanup(UCLN_I18N_UCOL_BLD, ucol_bld_cleanup);
              }
          }
      }
              }
          }
      }
-    return invUCA;
+    return _staticInvUCA;
  }
  
  #endif /* #if !UCONFIG_NO_COLLATION */
  }
  
  #endif /* #if !UCONFIG_NO_COLLATION */