ICU-6.2.4.tar.gz

[apple/icu.git] / icuSources / i18n / ucol_elm.cpp
diff --git a/icuSources/i18n/ucol_elm.cpp b/icuSources/i18n/ucol_elm.cpp

index 60adbbcbc97378ad61049b3ee67f5ef2266b3c6b..dd07039157f631adee4d8024f02b6f5787745270 100644 (file)
--- a/icuSources/i18n/ucol_elm.cpp
+++ b/icuSources/i18n/ucol_elm.cpp
@@ -1,7 +1,7 @@
  /*
  *******************************************************************************
  *
-*   Copyright (C) 2001-2003, International Business Machines
+*   Copyright (C) 2001-2004, International Business Machines
  *   Corporation and others.  All Rights Reserved.
  *
  *******************************************************************************
@@ -108,7 +108,7 @@ static int32_t uprv_uca_addExpansion(ExpansionTable *expansions, uint32_t value,
  }
  
  U_CAPI tempUCATable*  U_EXPORT2
-uprv_uca_initTempTable(UCATableHeader *image, UColOptionSet *opts, const UCollator *UCA, UColCETags initTag, UErrorCode *status) {
+uprv_uca_initTempTable(UCATableHeader *image, UColOptionSet *opts, const UCollator *UCA, UColCETags initTag, UColCETags supplementaryInitTag, UErrorCode *status) {
    tempUCATable *t = (tempUCATable *)uprv_malloc(sizeof(tempUCATable));
    /* test for NULL */
    if (t == NULL) {
@@ -147,7 +147,12 @@ uprv_uca_initTempTable(UCATableHeader *image, UColOptionSet *opts, const UCollat
    }
    uprv_memset(t->expansions, 0, sizeof(ExpansionTable));
    /*t->mapping = ucmpe32_open(UCOL_SPECIAL_FLAG | (initTag<<24), UCOL_SPECIAL_FLAG | (SURROGATE_TAG<<24), UCOL_SPECIAL_FLAG | (LEAD_SURROGATE_TAG<<24), status);*/
-  t->mapping = utrie_open(NULL, NULL, 0x100000, UCOL_SPECIAL_FLAG | (initTag<<24), TRUE); // Do your own mallocs for the structure, array and have linear Latin 1
+  /*t->mapping = utrie_open(NULL, NULL, 0x100000, UCOL_SPECIAL_FLAG | (initTag<<24), TRUE); // Do your own mallocs for the structure, array and have linear Latin 1*/
+
+  t->mapping = utrie_open(NULL, NULL, 0x100000,
+                          UCOL_SPECIAL_FLAG | (initTag<<24),
+                          UCOL_SPECIAL_FLAG | (supplementaryInitTag << 24),
+                          TRUE); // Do your own mallocs for the structure, array and have linear Latin 1
    t->prefixLookup = uhash_open(prefixLookupHash, prefixLookupComp, status);
    uhash_setValueDeleter(t->prefixLookup, uhash_freeBlock);
  
@@ -254,7 +259,7 @@ uprv_uca_cloneTempTable(tempUCATable *t, UErrorCode *status) {
            *status = U_MEMORY_ALLOCATION_ERROR;
            return NULL;
        }
-      uprv_memcpy(r->expansions->CEs, t->expansions->CEs, sizeof(uint32_t)*t->expansions->size);
+      uprv_memcpy(r->expansions->CEs, t->expansions->CEs, sizeof(uint32_t)*t->expansions->position);
      } else {
        r->expansions->CEs = NULL;
      }
@@ -281,7 +286,7 @@ uprv_uca_cloneTempTable(tempUCATable *t, UErrorCode *status) {
            *status = U_MEMORY_ALLOCATION_ERROR;
            return NULL;
        }
-      uprv_memcpy(r->maxExpansions->endExpansionCE, t->maxExpansions->endExpansionCE, t->maxExpansions->size*sizeof(uint32_t));
+      uprv_memcpy(r->maxExpansions->endExpansionCE, t->maxExpansions->endExpansionCE, t->maxExpansions->position*sizeof(uint32_t));
      } else {
        r->maxExpansions->endExpansionCE = NULL;
      }
@@ -292,7 +297,7 @@ uprv_uca_cloneTempTable(tempUCATable *t, UErrorCode *status) {
            *status = U_MEMORY_ALLOCATION_ERROR;
            return NULL;
        }
-      uprv_memcpy(r->maxExpansions->expansionCESize, t->maxExpansions->expansionCESize, t->maxExpansions->size*sizeof(uint8_t));
+      uprv_memcpy(r->maxExpansions->expansionCESize, t->maxExpansions->expansionCESize, t->maxExpansions->position*sizeof(uint8_t));
      } else {
        r->maxExpansions->expansionCESize = NULL;
      }
@@ -317,14 +322,14 @@ uprv_uca_cloneTempTable(tempUCATable *t, UErrorCode *status) {
            *status = U_MEMORY_ALLOCATION_ERROR;
            return NULL;
        }
-      uprv_memcpy(r->maxJamoExpansions->endExpansionCE, t->maxJamoExpansions->endExpansionCE, t->maxJamoExpansions->size*sizeof(uint32_t));
+      uprv_memcpy(r->maxJamoExpansions->endExpansionCE, t->maxJamoExpansions->endExpansionCE, t->maxJamoExpansions->position*sizeof(uint32_t));
        r->maxJamoExpansions->isV = (UBool *)uprv_malloc(sizeof(UBool)*t->maxJamoExpansions->size);
        /* test for NULL */
        if (r->maxJamoExpansions->isV == NULL) {
            *status = U_MEMORY_ALLOCATION_ERROR;
            return NULL;
        }
-      uprv_memcpy(r->maxJamoExpansions->isV, t->maxJamoExpansions->isV, t->maxJamoExpansions->size*sizeof(UBool));
+      uprv_memcpy(r->maxJamoExpansions->isV, t->maxJamoExpansions->isV, t->maxJamoExpansions->position*sizeof(UBool));
      } else {
        r->maxJamoExpansions->endExpansionCE = NULL;
        r->maxJamoExpansions->isV = NULL;
@@ -401,7 +406,7 @@ uprv_uca_closeTempTable(tempUCATable *t) {
  * @param status error status
  * @returns size of the maxexpansion and maxsize used.
  */
-int uprv_uca_setMaxExpansion(uint32_t           endexpansion,
+static int uprv_uca_setMaxExpansion(uint32_t           endexpansion,
                               uint8_t            expansionsize,
                               MaxExpansionTable *maxexpansion,
                               UErrorCode        *status)
@@ -465,7 +470,7 @@ int uprv_uca_setMaxExpansion(uint32_t           endexpansion,
        start = mid;                                                           
      }                                                                        
    } 
-      
+
    if (*start == endexpansion) {                                                     
      result = start - pendexpansionce;  
    }                                                                          
@@ -473,7 +478,7 @@ int uprv_uca_setMaxExpansion(uint32_t           endexpansion,
      if (*limit == endexpansion) {                                                     
        result = limit - pendexpansionce;      
      }                                            
-      
+
    if (result > -1) {
      /* found the ce in expansion, we'll just modify the size if it is 
         smaller */
@@ -489,9 +494,9 @@ int uprv_uca_setMaxExpansion(uint32_t           endexpansion,
      int      shiftsize     = (pendexpansionce + pos) - start;
      uint32_t *shiftpos     = start + 1;
      uint8_t  *sizeshiftpos = pexpansionsize + (shiftpos - pendexpansionce);
-    
+
      /* okay need to rearrange the array into sorted order */
-    if (shiftsize == 0 || *(pendexpansionce + pos) < endexpansion) {
+    if (shiftsize == 0 /*|| *(pendexpansionce + pos) < endexpansion*/) { /* the commented part is actually both redundant and dangerous */
        *(pendexpansionce + pos + 1) = endexpansion;
        *(pexpansionsize + pos + 1)  = expansionsize;
      }
@@ -543,7 +548,7 @@ int uprv_uca_setMaxExpansion(uint32_t           endexpansion,
  * @param status error status
  * @returns size of the maxexpansion and maxsize used.
  */
-int uprv_uca_setMaxJamoExpansion(UChar                  ch,
+static int uprv_uca_setMaxJamoExpansion(UChar                  ch,
                                   uint32_t               endexpansion,
                                   uint8_t                expansionsize,
                                   MaxJamoExpansionTable *maxexpansion,
@@ -626,7 +631,7 @@ int uprv_uca_setMaxJamoExpansion(UChar                  ch,
    *(pendexpansionce + maxexpansion->position) = endexpansion;
    *(maxexpansion->isV + maxexpansion->position) = isV;
    maxexpansion->position ++;
-  
+
    return maxexpansion->position;
  }
  
@@ -700,7 +705,7 @@ static void uprv_uca_unsafeCPAddCCNZ(tempUCATable *t, UErrorCode *status) {
      }
  }
  
-uint32_t uprv_uca_addPrefix(tempUCATable *t, uint32_t CE, 
+static uint32_t uprv_uca_addPrefix(tempUCATable *t, uint32_t CE, 
                                   UCAElements *element, UErrorCode *status) {
    // currently the longest prefix we're supporting in Japanese is two characters
    // long. Although this table could quite easily mimic complete contraction stuff
@@ -820,7 +825,7 @@ uint32_t uprv_uca_addPrefix(tempUCATable *t, uint32_t CE,
  // in the contraction, it is going to be handled as a pair of code units,
  // as it doesn't affect the performance AND handling surrogates specially
  // would complicate code way too much.
-uint32_t uprv_uca_addContraction(tempUCATable *t, uint32_t CE, 
+static uint32_t uprv_uca_addContraction(tempUCATable *t, uint32_t CE, 
                                   UCAElements *element, UErrorCode *status) {
      CntTable *contractions = t->contractions;
      UChar32 cp;
@@ -1007,26 +1012,12 @@ uprv_uca_addAnElement(tempUCATable *t, UCAElements *element, UErrorCode *status)
    if(U_FAILURE(*status)) {
        return 0xFFFF;
    }
+
+  element->mapCE = 0; // clear mapCE so that we can catch expansions
+
    if(element->noOfCEs == 1) {
      if(element->isThai == FALSE) {
-               UChar32 uniChar = 0;
-               //printElement(element);
-               if ((element->cSize == 2) && U16_IS_LEAD(element->uchars[0])){
-                       uniChar = U16_GET_SUPPLEMENTARY(element->uchars[0], element->uchars[1]);
-                       
-               } else if (element->cSize == 1){
-                       uniChar = element->uchars[0];
-
-               }
-               
-               if (uniChar != 0 && u_isdigit(uniChar)){
-                       expansion = (uint32_t)(UCOL_SPECIAL_FLAG | (DIGIT_TAG<<UCOL_TAG_SHIFT) 
-                       | ((uprv_uca_addExpansion(expansions, element->CEs[0], status)+(headersize>>2))<<4)
-                       | 0x1);
-                       element->mapCE = expansion;
-                       unsafeCPSet(t->unsafeCP, uniChar);
-               }else 
-                 element->mapCE = element->CEs[0];      
+          element->mapCE = element->CEs[0];      
      } else { /* add thai - totally bad here */
        expansion = (uint32_t)(UCOL_SPECIAL_FLAG | (THAI_TAG<<UCOL_TAG_SHIFT) 
          | ((uprv_uca_addExpansion(expansions, element->CEs[0], status)+(headersize>>2))<<4) 
@@ -1060,27 +1051,10 @@ uprv_uca_addAnElement(tempUCATable *t, UCAElements *element, UErrorCode *status)
          | ((element->CEs[0]>>8) & 0xFFFF00) // first and second byte of primary
          | ((element->CEs[1]>>24) & 0xFF);   // third byte of primary
      } else {
-    
-    /* Checking here to see if we should insert the DIGIT_TAG or the EXPANSION_TAG */
-      UChar32 uniChar = 0;
-      
-         if ((element->cSize == 2) && U16_IS_LEAD(element->uchars[0])){
-                  uniChar = U16_GET_SUPPLEMENTARY(element->uchars[0], element->uchars[1]);     
-          } else if (element->cSize == 1){
-                       uniChar = element->uchars[0];
-          }
-              
-      if (uniChar != 0 && u_isdigit(uniChar)){
-          expansion = (uint32_t)(UCOL_SPECIAL_FLAG | (DIGIT_TAG<<UCOL_TAG_SHIFT) 
-                             | ((uprv_uca_addExpansion(expansions, element->CEs[0], status)+(headersize>>2))<<4)
-                                         | 0x1);
-                 unsafeCPSet(t->unsafeCP, uniChar);
-      }else{           
-         expansion = (uint32_t)(UCOL_SPECIAL_FLAG | (EXPANSION_TAG<<UCOL_TAG_SHIFT) 
-               | ((uprv_uca_addExpansion(expansions, element->CEs[0], status)+(headersize>>2))<<4)
-               & 0xFFFFF0);
-         }
-               
+      expansion = (uint32_t)(UCOL_SPECIAL_FLAG | (EXPANSION_TAG<<UCOL_TAG_SHIFT) 
+        | ((uprv_uca_addExpansion(expansions, element->CEs[0], status)+(headersize>>2))<<4)
+        & 0xFFFFF0);
+        
        for(i = 1; i<element->noOfCEs; i++) {
          uprv_uca_addExpansion(expansions, element->CEs[i], status);
        }
@@ -1105,6 +1079,37 @@ uprv_uca_addAnElement(tempUCATable *t, UCAElements *element, UErrorCode *status)
      }
    }
  
+  // Digits are special-cased: when numeric collation is on they must be
+  // processed differently, so they are tagged with a DIGIT_TAG special CE below.
+  UChar32 uniChar = 0;
+  //printElement(element);
+  if ((element->cSize == 2) && U16_IS_LEAD(element->uchars[0])){
+      uniChar = U16_GET_SUPPLEMENTARY(element->uchars[0], element->uchars[1]);
+  } else if (element->cSize == 1){
+      uniChar = element->uchars[0];
+  }
+
+  // Here, we either have one normal CE OR mapCE is set. Therefore, we stuff only
+  // one element to the expansion buffer. When we encounter a digit and we don't 
+  // do numeric collation, we will just pick the CE we have and break out of case
+  // (see ucol.cpp ucol_prv_getSpecialCE && ucol_prv_getSpecialPrevCE). If we picked
+  // a special, further processing will occur. If it's a simple CE, we'll return due
+  // to how the loop is constructed.
+  if (uniChar != 0 && u_isdigit(uniChar)){
+      expansion = (uint32_t)(UCOL_SPECIAL_FLAG | (DIGIT_TAG<<UCOL_TAG_SHIFT) | 1); // prepare the element
+      if(element->mapCE) { // if there is an expansion, we'll pick it here
+        expansion |= ((uprv_uca_addExpansion(expansions, element->mapCE, status)+(headersize>>2))<<4);
+      } else {
+        expansion |= ((uprv_uca_addExpansion(expansions, element->CEs[0], status)+(headersize>>2))<<4);
+      }
+      element->mapCE = expansion;
+      
+      // Need to go back to the beginning of the digit string if in the middle!
+      if(uniChar <= 0xFFFF) { // supplementaries are always unsafe. API takes UChars
+        unsafeCPSet(t->unsafeCP, (UChar)uniChar);
+      }
+  }
+
    // here we want to add the prefix structure.
    // I will try to process it as a reverse contraction, if possible.
    // prefix buffer is already reversed.
@@ -1176,7 +1181,7 @@ uprv_uca_addAnElement(tempUCATable *t, UCAElements *element, UErrorCode *status)
  
  
  /*void uprv_uca_getMaxExpansionJamo(CompactEIntArray       *mapping, */
-void uprv_uca_getMaxExpansionJamo(UNewTrie       *mapping, 
+static void uprv_uca_getMaxExpansionJamo(UNewTrie       *mapping, 
                                    MaxExpansionTable     *maxexpansion,
                                    MaxJamoExpansionTable *maxjamoexpansion,
                                    UBool                  jamospecial,
@@ -1186,7 +1191,7 @@ void uprv_uca_getMaxExpansionJamo(UNewTrie       *mapping,
    const uint32_t TBASE  = 0x11A8;
    const uint32_t VCOUNT = 21;
    const uint32_t TCOUNT = 28;
-  
+
    uint32_t v = VBASE + VCOUNT - 1;
    uint32_t t = TBASE + TCOUNT - 1;
    uint32_t ce;
@@ -1346,7 +1351,20 @@ uprv_uca_assembleTable(tempUCATable *t, UErrorCode *status) {
      }
  
      UCATableHeader *myData = (UCATableHeader *)dataStart;
-    uprv_memcpy(myData, t->image, sizeof(UCATableHeader));
+    // Please, do reset all the fields!
+    uprv_memset(dataStart, 0, toAllocate);
+    // Make sure we know this is reset
+    myData->magic = UCOL_HEADER_MAGIC;
+    myData->isBigEndian = U_IS_BIG_ENDIAN;
+    myData->charSetFamily = U_CHARSET_FAMILY;
+    myData->formatVersion[0] = UCA_FORMAT_VERSION_0;
+    myData->formatVersion[1] = UCA_FORMAT_VERSION_1;
+    myData->formatVersion[2] = UCA_FORMAT_VERSION_2;
+    myData->formatVersion[3] = UCA_FORMAT_VERSION_3;
+    myData->jamoSpecial = t->image->jamoSpecial;
+
+    // Don't copy stuff from UCA header!
+    //uprv_memcpy(myData, t->image, sizeof(UCATableHeader));
  
      myData->contractionSize = contractionsSize;
  
@@ -1377,7 +1395,7 @@ uprv_uca_assembleTable(tempUCATable *t, UErrorCode *status) {
        tableOffset += (uint32_t)(paddedsize(contractionsSize*sizeof(uint32_t)));
      } else {
        myData->contractionIndex = 0;
-      myData->contractionIndex = 0;
+      myData->contractionCEs = 0;
      }
  
      /* copy mapping table */
@@ -1391,11 +1409,13 @@ uprv_uca_assembleTable(tempUCATable *t, UErrorCode *status) {
      // This is debug code to dump the contents of the trie. It needs two functions defined above
      {
        UTrie UCAt = { 0 };
+      uint32_t trieWord;
        utrie_unserialize(&UCAt, dataStart+tableOffset, 9999999, status);
        UCAt.getFoldingOffset = myGetFoldingOffset;
        if(U_SUCCESS(*status)) {
          utrie_enum(&UCAt, NULL, enumRange, NULL);
        }
+      trieWord = UTRIE_GET32_FROM_LEAD(UCAt, 0xDC01) 
      }
  #endif
      tableOffset += paddedsize(mappingSize);
@@ -1510,7 +1530,7 @@ _enumCategoryRangeClosureCategory(const void *context, UChar32 start, UChar32 li
              el.prefixSize = 0;
              el.noOfCEs = 0;
              ucol_setText(colEl, decomp, noOfDec, status);
-            while((el.CEs[el.noOfCEs] = ucol_next(colEl, status)) != UCOL_NULLORDER) {
+            while((el.CEs[el.noOfCEs] = ucol_next(colEl, status)) != (uint32_t)UCOL_NULLORDER) {
                el.noOfCEs++;
              }
            } else {
@@ -1551,7 +1571,7 @@ uprv_uca_canonicalClosure(tempUCATable *t, UErrorCode *status)
      tempUCATable *tempTable = uprv_uca_cloneTempTable(t, status);
  
      UCATableHeader *tempData = uprv_uca_assembleTable(tempTable, status);
-    tempColl = ucol_initCollator(tempData, 0, status);
+    tempColl = ucol_initCollator(tempData, 0, t->UCA, status);
      uprv_uca_closeTempTable(tempTable);    
  
      if(U_SUCCESS(*status)) {
@@ -1582,4 +1602,6 @@ uprv_uca_canonicalClosure(tempUCATable *t, UErrorCode *status)
  
  U_NAMESPACE_END
  
-#endif /* #if !UCONFIG_NO_COLLATION */
-\ No newline at end of file
+#endif /* #if !UCONFIG_NO_COLLATION */
+
+