2 ******************************************************************************* 
   3 *   Copyright (C) 2010-2012, International Business Machines 
   4 *   Corporation and others.  All Rights Reserved. 
   5 ******************************************************************************* 
   6 *   file name:  ucharstriebuilder.h 
   8 *   tab size:   8 (not used) 
  11 *   created on: 2010nov14 
  12 *   created by: Markus W. Scherer 
  15 #include "unicode/utypes.h" 
  16 #include "unicode/ucharstrie.h" 
  17 #include "unicode/ucharstriebuilder.h" 
  18 #include "unicode/unistr.h" 
  19 #include "unicode/ustring.h" 
  29  * Note: This builder implementation stores (string, value) pairs with full copies 
  30  * of the 16-bit-unit sequences, until the UCharsTrie is built. 
  31  * It might(!) take less memory if we collected the data in a temporary, dynamic trie. 
  34 class UCharsTrieElement 
: public UMemory 
{ 
  36     // Use compiler's default constructor, initializes nothing. 
  38     void setTo(const UnicodeString 
&s
, int32_t val
, UnicodeString 
&strings
, UErrorCode 
&errorCode
); 
  40     UnicodeString 
getString(const UnicodeString 
&strings
) const { 
  41         int32_t length
=strings
[stringOffset
]; 
  42         return strings
.tempSubString(stringOffset
+1, length
); 
  44     int32_t getStringLength(const UnicodeString 
&strings
) const { 
  45         return strings
[stringOffset
]; 
  48     UChar 
charAt(int32_t index
, const UnicodeString 
&strings
) const { 
  49         return strings
[stringOffset
+1+index
]; 
  52     int32_t getValue() const { return value
; } 
  54     int32_t compareStringTo(const UCharsTrieElement 
&o
, const UnicodeString 
&strings
) const; 
  57     // The first strings unit contains the string length. 
  58     // (Compared with a stringLength field here, this saves 2 bytes per string.) 
  64 UCharsTrieElement::setTo(const UnicodeString 
&s
, int32_t val
, 
  65                          UnicodeString 
&strings
, UErrorCode 
&errorCode
) { 
  66     if(U_FAILURE(errorCode
)) { 
  69     int32_t length
=s
.length(); 
  71         // Too long: We store the length in 1 unit. 
  72         errorCode
=U_INDEX_OUTOFBOUNDS_ERROR
; 
  75     stringOffset
=strings
.length(); 
  76     strings
.append((UChar
)length
); 
  82 UCharsTrieElement::compareStringTo(const UCharsTrieElement 
&other
, const UnicodeString 
&strings
) const { 
  83     return getString(strings
).compare(other
.getString(strings
)); 
  86 UCharsTrieBuilder::UCharsTrieBuilder(UErrorCode 
& /*errorCode*/) 
  87         : elements(NULL
), elementsCapacity(0), elementsLength(0), 
  88           uchars(NULL
), ucharsCapacity(0), ucharsLength(0) {} 
  90 UCharsTrieBuilder::~UCharsTrieBuilder() { 
  96 UCharsTrieBuilder::add(const UnicodeString 
&s
, int32_t value
, UErrorCode 
&errorCode
) { 
  97     if(U_FAILURE(errorCode
)) { 
 101         // Cannot add elements after building. 
 102         errorCode
=U_NO_WRITE_PERMISSION
; 
 105     if(elementsLength
==elementsCapacity
) { 
 107         if(elementsCapacity
==0) { 
 110             newCapacity
=4*elementsCapacity
; 
 112         UCharsTrieElement 
*newElements
=new UCharsTrieElement
[newCapacity
]; 
 113         if(newElements
==NULL
) { 
 114             errorCode
=U_MEMORY_ALLOCATION_ERROR
; 
 117         if(elementsLength
>0) { 
 118             uprv_memcpy(newElements
, elements
, elementsLength
*sizeof(UCharsTrieElement
)); 
 121         elements
=newElements
; 
 122         elementsCapacity
=newCapacity
; 
 124     elements
[elementsLength
++].setTo(s
, value
, strings
, errorCode
); 
 125     if(U_SUCCESS(errorCode
) && strings
.isBogus()) { 
 126         errorCode
=U_MEMORY_ALLOCATION_ERROR
; 
 133 static int32_t U_CALLCONV
 
 134 compareElementStrings(const void *context
, const void *left
, const void *right
) { 
 135     const UnicodeString 
*strings
=static_cast<const UnicodeString 
*>(context
); 
 136     const UCharsTrieElement 
*leftElement
=static_cast<const UCharsTrieElement 
*>(left
); 
 137     const UCharsTrieElement 
*rightElement
=static_cast<const UCharsTrieElement 
*>(right
); 
 138     return leftElement
->compareStringTo(*rightElement
, *strings
); 
 144 UCharsTrieBuilder::build(UStringTrieBuildOption buildOption
, UErrorCode 
&errorCode
) { 
 145     buildUChars(buildOption
, errorCode
); 
 146     UCharsTrie 
*newTrie
=NULL
; 
 147     if(U_SUCCESS(errorCode
)) { 
 148         newTrie
=new UCharsTrie(uchars
, uchars
+(ucharsCapacity
-ucharsLength
)); 
 150             errorCode
=U_MEMORY_ALLOCATION_ERROR
; 
 152             uchars
=NULL
;  // The new trie now owns the array. 
 160 UCharsTrieBuilder::buildUnicodeString(UStringTrieBuildOption buildOption
, UnicodeString 
&result
, 
 161                                       UErrorCode 
&errorCode
) { 
 162     buildUChars(buildOption
, errorCode
); 
 163     if(U_SUCCESS(errorCode
)) { 
 164         result
.setTo(FALSE
, uchars
+(ucharsCapacity
-ucharsLength
), ucharsLength
); 
 170 UCharsTrieBuilder::buildUChars(UStringTrieBuildOption buildOption
, UErrorCode 
&errorCode
) { 
 171     if(U_FAILURE(errorCode
)) { 
 174     if(uchars
!=NULL 
&& ucharsLength
>0) { 
 178     if(ucharsLength
==0) { 
 179         if(elementsLength
==0) { 
 180             errorCode
=U_INDEX_OUTOFBOUNDS_ERROR
; 
 183         if(strings
.isBogus()) { 
 184             errorCode
=U_MEMORY_ALLOCATION_ERROR
; 
 187         uprv_sortArray(elements
, elementsLength
, (int32_t)sizeof(UCharsTrieElement
), 
 188                       compareElementStrings
, &strings
, 
 189                       FALSE
,  // need not be a stable sort 
 191         if(U_FAILURE(errorCode
)) { 
 194         // Duplicate strings are not allowed. 
 195         UnicodeString prev
=elements
[0].getString(strings
); 
 196         for(int32_t i
=1; i
<elementsLength
; ++i
) { 
 197             UnicodeString current
=elements
[i
].getString(strings
); 
 199                 errorCode
=U_ILLEGAL_ARGUMENT_ERROR
; 
 202             prev
.fastCopyFrom(current
); 
 205     // Create and UChar-serialize the trie for the elements. 
 207     int32_t capacity
=strings
.length(); 
 211     if(ucharsCapacity
<capacity
) { 
 213         uchars
=static_cast<UChar 
*>(uprv_malloc(capacity
*2)); 
 215             errorCode
=U_MEMORY_ALLOCATION_ERROR
; 
 219         ucharsCapacity
=capacity
; 
 221     StringTrieBuilder::build(buildOption
, elementsLength
, errorCode
); 
 223         errorCode
=U_MEMORY_ALLOCATION_ERROR
; 
 228 UCharsTrieBuilder::getElementStringLength(int32_t i
) const { 
 229     return elements
[i
].getStringLength(strings
); 
 233 UCharsTrieBuilder::getElementUnit(int32_t i
, int32_t unitIndex
) const { 
 234     return elements
[i
].charAt(unitIndex
, strings
); 
 238 UCharsTrieBuilder::getElementValue(int32_t i
) const { 
 239     return elements
[i
].getValue(); 
 243 UCharsTrieBuilder::getLimitOfLinearMatch(int32_t first
, int32_t last
, int32_t unitIndex
) const { 
 244     const UCharsTrieElement 
&firstElement
=elements
[first
]; 
 245     const UCharsTrieElement 
&lastElement
=elements
[last
]; 
 246     int32_t minStringLength
=firstElement
.getStringLength(strings
); 
 247     while(++unitIndex
<minStringLength 
&& 
 248             firstElement
.charAt(unitIndex
, strings
)== 
 249             lastElement
.charAt(unitIndex
, strings
)) {} 
 254 UCharsTrieBuilder::countElementUnits(int32_t start
, int32_t limit
, int32_t unitIndex
) const { 
 255     int32_t length
=0;  // Number of different units at unitIndex. 
 258         UChar unit
=elements
[i
++].charAt(unitIndex
, strings
); 
 259         while(i
<limit 
&& unit
==elements
[i
].charAt(unitIndex
, strings
)) { 
 268 UCharsTrieBuilder::skipElementsBySomeUnits(int32_t i
, int32_t unitIndex
, int32_t count
) const { 
 270         UChar unit
=elements
[i
++].charAt(unitIndex
, strings
); 
 271         while(unit
==elements
[i
].charAt(unitIndex
, strings
)) { 
 279 UCharsTrieBuilder::indexOfElementWithNextUnit(int32_t i
, int32_t unitIndex
, UChar unit
) const { 
 280     while(unit
==elements
[i
].charAt(unitIndex
, strings
)) { 
 286 UCharsTrieBuilder::UCTLinearMatchNode::UCTLinearMatchNode(const UChar 
*units
, int32_t len
, Node 
*nextNode
) 
 287         : LinearMatchNode(len
, nextNode
), s(units
) { 
 288     hash
=hash
*37+ustr_hashUCharsN(units
, len
); 
 292 UCharsTrieBuilder::UCTLinearMatchNode::operator==(const Node 
&other
) const { 
 296     if(!LinearMatchNode::operator==(other
)) { 
 299     const UCTLinearMatchNode 
&o
=(const UCTLinearMatchNode 
&)other
; 
 300     return 0==u_memcmp(s
, o
.s
, length
); 
 304 UCharsTrieBuilder::UCTLinearMatchNode::write(StringTrieBuilder 
&builder
) { 
 305     UCharsTrieBuilder 
&b
=(UCharsTrieBuilder 
&)builder
; 
 306     next
->write(builder
); 
 308     offset
=b
.writeValueAndType(hasValue
, value
, b
.getMinLinearMatch()+length
-1); 
 311 StringTrieBuilder::Node 
* 
 312 UCharsTrieBuilder::createLinearMatchNode(int32_t i
, int32_t unitIndex
, int32_t length
, 
 313                                          Node 
*nextNode
) const { 
 314     return new UCTLinearMatchNode( 
 315             elements
[i
].getString(strings
).getBuffer()+unitIndex
, 
 321 UCharsTrieBuilder::ensureCapacity(int32_t length
) { 
 323         return FALSE
;  // previous memory allocation had failed 
 325     if(length
>ucharsCapacity
) { 
 326         int32_t newCapacity
=ucharsCapacity
; 
 329         } while(newCapacity
<=length
); 
 330         UChar 
*newUChars
=static_cast<UChar 
*>(uprv_malloc(newCapacity
*2)); 
 331         if(newUChars
==NULL
) { 
 332             // unable to allocate memory 
 338         u_memcpy(newUChars
+(newCapacity
-ucharsLength
), 
 339                  uchars
+(ucharsCapacity
-ucharsLength
), ucharsLength
); 
 342         ucharsCapacity
=newCapacity
; 
 348 UCharsTrieBuilder::write(int32_t unit
) { 
 349     int32_t newLength
=ucharsLength
+1; 
 350     if(ensureCapacity(newLength
)) { 
 351         ucharsLength
=newLength
; 
 352         uchars
[ucharsCapacity
-ucharsLength
]=(UChar
)unit
; 
 358 UCharsTrieBuilder::write(const UChar 
*s
, int32_t length
) { 
 359     int32_t newLength
=ucharsLength
+length
; 
 360     if(ensureCapacity(newLength
)) { 
 361         ucharsLength
=newLength
; 
 362         u_memcpy(uchars
+(ucharsCapacity
-ucharsLength
), s
, length
); 
 368 UCharsTrieBuilder::writeElementUnits(int32_t i
, int32_t unitIndex
, int32_t length
) { 
 369     return write(elements
[i
].getString(strings
).getBuffer()+unitIndex
, length
); 
 373 UCharsTrieBuilder::writeValueAndFinal(int32_t i
, UBool isFinal
) { 
 374     if(0<=i 
&& i
<=UCharsTrie::kMaxOneUnitValue
) { 
 375         return write(i
|(isFinal
<<15)); 
 379     if(i
<0 || i
>UCharsTrie::kMaxTwoUnitValue
) { 
 380         intUnits
[0]=(UChar
)(UCharsTrie::kThreeUnitValueLead
); 
 381         intUnits
[1]=(UChar
)((uint32_t)i
>>16); 
 382         intUnits
[2]=(UChar
)i
; 
 384     // } else if(i<=UCharsTrie::kMaxOneUnitValue) { 
 385     //     intUnits[0]=(UChar)(i); 
 388         intUnits
[0]=(UChar
)(UCharsTrie::kMinTwoUnitValueLead
+(i
>>16)); 
 389         intUnits
[1]=(UChar
)i
; 
 392     intUnits
[0]=(UChar
)(intUnits
[0]|(isFinal
<<15)); 
 393     return write(intUnits
, length
); 
 397 UCharsTrieBuilder::writeValueAndType(UBool hasValue
, int32_t value
, int32_t node
) { 
 403     if(value
<0 || value
>UCharsTrie::kMaxTwoUnitNodeValue
) { 
 404         intUnits
[0]=(UChar
)(UCharsTrie::kThreeUnitNodeValueLead
); 
 405         intUnits
[1]=(UChar
)((uint32_t)value
>>16); 
 406         intUnits
[2]=(UChar
)value
; 
 408     } else if(value
<=UCharsTrie::kMaxOneUnitNodeValue
) { 
 409         intUnits
[0]=(UChar
)((value
+1)<<6); 
 412         intUnits
[0]=(UChar
)(UCharsTrie::kMinTwoUnitNodeValueLead
+((value
>>10)&0x7fc0)); 
 413         intUnits
[1]=(UChar
)value
; 
 416     intUnits
[0]|=(UChar
)node
; 
 417     return write(intUnits
, length
); 
 421 UCharsTrieBuilder::writeDeltaTo(int32_t jumpTarget
) { 
 422     int32_t i
=ucharsLength
-jumpTarget
; 
 424     if(i
<=UCharsTrie::kMaxOneUnitDelta
) { 
 429     if(i
<=UCharsTrie::kMaxTwoUnitDelta
) { 
 430         intUnits
[0]=(UChar
)(UCharsTrie::kMinTwoUnitDeltaLead
+(i
>>16)); 
 433         intUnits
[0]=(UChar
)(UCharsTrie::kThreeUnitDeltaLead
); 
 434         intUnits
[1]=(UChar
)(i
>>16); 
 437     intUnits
[length
++]=(UChar
)i
; 
 438     return write(intUnits
, length
);