]>
Commit | Line | Data |
---|---|---|
b75a7d8f A |
1 | /* |
2 | ******************************************************************************* | |
3 | * | |
46f4442e | 4 | * Copyright (C) 2000-2008, International Business Machines |
b75a7d8f A |
5 | * Corporation and others. All Rights Reserved. |
6 | * | |
7 | ******************************************************************************* | |
8 | * file name: genuca.cpp | |
9 | * encoding: US-ASCII | |
10 | * tab size: 8 (not used) | |
11 | * indentation:4 | |
12 | * | |
13 | * created at the end of XX century | |
14 | * created by: Vladimir Weinstein | |
15 | * | |
16 | * This program reads the Fractional UCA table and generates | |
17 | * internal format for UCA table as well as inverse UCA table. | |
18 | * It then writes binary files containing the data: ucadata.dat | |
19 | * & invuca.dat | |
20 | * Change history: | |
21 | * 02/23/2001 grhoten Made it into a tool | |
22 | * 02/23/2001 weiv Moved element & table handling code to i18n | |
23 | * 05/09/2001 weiv Case bits are now in the CEs, not in front | |
24 | */ | |
25 | ||
b75a7d8f | 26 | #include "unicode/utypes.h" |
374ca955 | 27 | #include "unicode/putil.h" |
b75a7d8f | 28 | #include "unicode/udata.h" |
374ca955 | 29 | #include "unicode/uclean.h" |
b75a7d8f A |
30 | #include "ucol_imp.h" |
31 | #include "genuca.h" | |
32 | #include "uoptions.h" | |
33 | #include "toolutil.h" | |
34 | #include "unewdata.h" | |
35 | #include "cstring.h" | |
36 | #include "cmemory.h" | |
37 | ||
374ca955 A |
38 | #include <stdio.h> |
39 | ||
b75a7d8f A |
40 | /* |
41 | * Global - verbosity | |
42 | */ | |
43 | UBool VERBOSE = FALSE; | |
44 | ||
/* UCA version; readAnElement() fills it from the "[UCA version = " directive line. */
45 | static UVersionInfo UCAVersion; | |
46 | ||
47 | #if UCONFIG_NO_COLLATION | |
48 | ||
49 | /* dummy UDataInfo cf. udata.h */ | |
/* Placeholder header, compiled only when UCONFIG_NO_COLLATION is set; every
 * format and version field is zero. */
50 | static UDataInfo dummyDataInfo = { | |
51 | sizeof(UDataInfo), | |
52 | 0, | |
53 | ||
54 | U_IS_BIG_ENDIAN, | |
55 | U_CHARSET_FAMILY, | |
56 | U_SIZEOF_UCHAR, | |
57 | 0, | |
58 | ||
59 | { 0, 0, 0, 0 }, /* dummy dataFormat */ | |
60 | { 0, 0, 0, 0 }, /* dummy formatVersion */ | |
61 | { 0, 0, 0, 0 } /* dummy dataVersion */ | |
62 | }; | |
63 | ||
64 | #else | |
65 | ||
374ca955 A |
/* Header template for the UCA data file. writeOutData() copies it and fills
 * dataVersion from u_getUnicodeVersion() before creating the file, so the
 * zeros in the last row are only placeholders. */
66 | static const UDataInfo ucaDataInfo={ |
67 | sizeof(UDataInfo), | |
68 | 0, | |
69 | ||
70 | U_IS_BIG_ENDIAN, | |
71 | U_CHARSET_FAMILY, | |
72 | sizeof(UChar), | |
73 | 0, | |
74 | ||
75 | {UCA_DATA_FORMAT_0, UCA_DATA_FORMAT_1, UCA_DATA_FORMAT_2, UCA_DATA_FORMAT_3}, /* dataFormat="UCol" */ | |
76 | /* 03/26/2002 bumped up version since format has changed */ | |
77 | /* 09/16/2002 bumped up version since we went from UColAttributeValue */ | |
78 | /* to int32_t in UColOptionSet */ | |
79 | /* 05/13/2003 This one also updated since we added UCA and UCD versions */ | |
80 | /* to header */ | |
81 | /* 09/11/2003 Adding information required by data swapper */ | |
82 | {UCA_FORMAT_VERSION_0, UCA_FORMAT_VERSION_1, UCA_FORMAT_VERSION_2, UCA_FORMAT_VERSION_3}, /* formatVersion */ | |
83 | {0, 0, 0, 0} /* dataVersion = Unicode Version*/ | |
84 | }; | |
85 | ||
/* Header template for the inverse UCA data file. writeOutInverseData() copies
 * it and fills dataVersion from u_getUnicodeVersion() before creating the
 * file, so the zeros in the last row are only placeholders. */
86 | static const UDataInfo invUcaDataInfo={ | |
87 | sizeof(UDataInfo), | |
88 | 0, | |
89 | ||
90 | U_IS_BIG_ENDIAN, | |
91 | U_CHARSET_FAMILY, | |
92 | sizeof(UChar), | |
93 | 0, | |
94 | ||
95 | {INVUCA_DATA_FORMAT_0, INVUCA_DATA_FORMAT_1, INVUCA_DATA_FORMAT_2, INVUCA_DATA_FORMAT_3}, /* dataFormat="InvC" */ | |
96 | /* 03/26/2002 bumped up version since format has changed */ | |
97 | /* 04/29/2003 2.1 format - we have added UCA version to header */ | |
98 | {INVUCA_FORMAT_VERSION_0, INVUCA_FORMAT_VERSION_1, INVUCA_FORMAT_VERSION_2, INVUCA_FORMAT_VERSION_3}, /* formatVersion */ | |
99 | {0, 0, 0, 0} /* dataVersion = Unicode Version*/ | |
100 | }; | |
101 | ||
b75a7d8f A |
/* File-level element object. NOTE(review): readAnElement() appears to fill and
 * return a pointer to this object (its initializer is garbled in this copy of
 * the file) - confirm before relying on it. */
102 | UCAElements le; |
103 | ||
104 | int32_t readElement(char **from, char *to, char separator, UErrorCode *status) { | |
105 | if(U_FAILURE(*status)) { | |
106 | return 0; | |
107 | } | |
108 | char buffer[1024]; | |
109 | int32_t i = 0; | |
110 | while(**from != separator) { | |
111 | if(**from != ' ') { | |
112 | *(buffer+i++) = **from; | |
113 | } | |
114 | (*from)++; | |
115 | } | |
116 | (*from)++; | |
117 | *(buffer + i) = 0; | |
118 | //*to = (char *)malloc(strlen(buffer)+1); | |
119 | strcpy(to, buffer); | |
120 | return i/2; | |
121 | } | |
122 | ||
123 | ||
124 | uint32_t getSingleCEValue(char *primary, char *secondary, char *tertiary, UErrorCode *status) { | |
125 | if(U_FAILURE(*status)) { | |
126 | return 0; | |
127 | } | |
128 | uint32_t value = 0; | |
129 | char primsave = '\0'; | |
130 | char secsave = '\0'; | |
131 | char tersave = '\0'; | |
132 | char *primend = primary+4; | |
133 | if(strlen(primary) > 4) { | |
134 | primsave = *primend; | |
135 | *primend = '\0'; | |
136 | } | |
137 | char *secend = secondary+2; | |
138 | if(strlen(secondary) > 2) { | |
139 | secsave = *secend; | |
140 | *secend = '\0'; | |
141 | } | |
142 | char *terend = tertiary+2; | |
143 | if(strlen(tertiary) > 2) { | |
144 | tersave = *terend; | |
145 | *terend = '\0'; | |
146 | } | |
147 | uint32_t primvalue = (uint32_t)((*primary!='\0')?strtoul(primary, &primend, 16):0); | |
148 | uint32_t secvalue = (uint32_t)((*secondary!='\0')?strtoul(secondary, &secend, 16):0); | |
149 | uint32_t tervalue = (uint32_t)((*tertiary!='\0')?strtoul(tertiary, &terend, 16):0); | |
150 | if(primvalue <= 0xFF) { | |
151 | primvalue <<= 8; | |
152 | } | |
153 | ||
154 | value = ((primvalue<<UCOL_PRIMARYORDERSHIFT)&UCOL_PRIMARYORDERMASK)| | |
155 | ((secvalue<<UCOL_SECONDARYORDERSHIFT)&UCOL_SECONDARYORDERMASK)| | |
156 | (tervalue&UCOL_TERTIARYORDERMASK); | |
157 | ||
158 | if(primsave!='\0') { | |
159 | *primend = primsave; | |
160 | } | |
161 | if(secsave!='\0') { | |
162 | *secend = secsave; | |
163 | } | |
164 | if(tersave!='\0') { | |
165 | *terend = tersave; | |
166 | } | |
167 | return value; | |
168 | } | |
169 | ||
/* Rows of the inverse table as filled by addNewInverse()/insertInverse():
 * [0] first CE, [1] continuation CE or 0, [2] either a single code unit or
 * (size << UCOL_INV_SHIFTVALUE) | offset into stringContinue. */
170 | static uint32_t inverseTable[0xFFFF][3]; | |
/* Index of the last row in use (addNewInverse() increments it before writing). */
171 | static uint32_t inversePos = 0; | |
/* Storage for the code unit strings referenced from column [2] of inverseTable. */
172 | static UChar stringContinue[0xFFFF]; | |
/* Next free index in stringContinue. */
173 | static uint32_t sContPos = 0; | |
174 | ||
175 | static void addNewInverse(UCAElements *element, UErrorCode *status) { | |
176 | if(U_FAILURE(*status)) { | |
177 | return; | |
178 | } | |
179 | if(VERBOSE && isContinuation(element->CEs[1])) { | |
180 | //fprintf(stdout, "+"); | |
181 | } | |
182 | inversePos++; | |
183 | inverseTable[inversePos][0] = element->CEs[0]; | |
184 | if(element->noOfCEs > 1 && isContinuation(element->CEs[1])) { | |
185 | inverseTable[inversePos][1] = element->CEs[1]; | |
186 | } else { | |
187 | inverseTable[inversePos][1] = 0; | |
188 | } | |
189 | if(element->cSize < 2) { | |
190 | inverseTable[inversePos][2] = element->cPoints[0]; | |
191 | } else { /* add a new store of cruft */ | |
192 | inverseTable[inversePos][2] = ((element->cSize+1) << UCOL_INV_SHIFTVALUE) | sContPos; | |
193 | memcpy(stringContinue+sContPos, element->cPoints, element->cSize*sizeof(UChar)); | |
194 | sContPos += element->cSize+1; | |
195 | } | |
196 | } | |
197 | ||
198 | static void insertInverse(UCAElements *element, uint32_t position, UErrorCode *status) { | |
199 | if(U_FAILURE(*status)) { | |
200 | return; | |
201 | } | |
202 | ||
203 | if(VERBOSE && isContinuation(element->CEs[1])) { | |
204 | //fprintf(stdout, "+"); | |
205 | } | |
206 | if(position <= inversePos) { | |
207 | /*move stuff around */ | |
208 | uint32_t amountToMove = (inversePos - position+1)*sizeof(inverseTable[0]); | |
209 | uprv_memmove(inverseTable[position+1], inverseTable[position], amountToMove); | |
210 | } | |
211 | inverseTable[position][0] = element->CEs[0]; | |
212 | if(element->noOfCEs > 1 && isContinuation(element->CEs[1])) { | |
213 | inverseTable[position][1] = element->CEs[1]; | |
214 | } else { | |
215 | inverseTable[position][1] = 0; | |
216 | } | |
217 | if(element->cSize < 2) { | |
218 | inverseTable[position][2] = element->cPoints[0]; | |
219 | } else { /* add a new store of cruft */ | |
220 | inverseTable[position][2] = ((element->cSize+1) << UCOL_INV_SHIFTVALUE) | sContPos; | |
221 | memcpy(stringContinue+sContPos, element->cPoints, element->cSize*sizeof(UChar)); | |
222 | sContPos += element->cSize+1; | |
223 | } | |
224 | inversePos++; | |
225 | } | |
226 | ||
227 | static void addToExistingInverse(UCAElements *element, uint32_t position, UErrorCode *status) { | |
228 | ||
229 | if(U_FAILURE(*status)) { | |
230 | return; | |
231 | } | |
232 | ||
233 | if((inverseTable[position][2] & UCOL_INV_SIZEMASK) == 0) { /* single element, have to make new extension place and put both guys there */ | |
234 | stringContinue[sContPos] = (UChar)inverseTable[position][2]; | |
235 | inverseTable[position][2] = ((element->cSize+3) << UCOL_INV_SHIFTVALUE) | sContPos; | |
236 | sContPos++; | |
237 | stringContinue[sContPos++] = 0xFFFF; | |
238 | memcpy(stringContinue+sContPos, element->cPoints, element->cSize*sizeof(UChar)); | |
239 | sContPos += element->cSize; | |
240 | stringContinue[sContPos++] = 0xFFFE; | |
241 | } else { /* adding to the already existing continuing table */ | |
242 | uint32_t contIndex = inverseTable[position][2] & UCOL_INV_OFFSETMASK; | |
243 | uint32_t contSize = (inverseTable[position][2] & UCOL_INV_SIZEMASK) >> UCOL_INV_SHIFTVALUE; | |
244 | ||
245 | if(contIndex+contSize < sContPos) { | |
246 | /*fprintf(stderr, ".", sContPos, contIndex+contSize);*/ | |
247 | memcpy(stringContinue+contIndex+contSize+element->cSize+1, stringContinue+contIndex+contSize, (element->cSize+1)*sizeof(UChar)); | |
248 | } | |
249 | ||
250 | stringContinue[contIndex+contSize-1] = 0xFFFF; | |
251 | memcpy(stringContinue+contIndex+contSize, element->cPoints, element->cSize*sizeof(UChar)); | |
252 | sContPos += element->cSize+1; | |
253 | stringContinue[contIndex+contSize+element->cSize] = 0xFFFE; | |
254 | ||
255 | inverseTable[position][2] = ((contSize+element->cSize+1) << UCOL_INV_SHIFTVALUE) | contIndex; | |
256 | } | |
257 | } | |
258 | ||
374ca955 A |
259 | /* |
260 | * Takes two CEs (lead and continuation) and | |
261 | * compares them as CEs should be compared: | |
262 | * primary vs. primary, secondary vs. secondary | |
263 | * tertiary vs. tertiary | |
264 | */ | |
265 | static int32_t compareCEs(uint32_t *source, uint32_t *target) { | |
266 | uint32_t s1 = source[0], s2, t1 = target[0], t2; | |
267 | if(isContinuation(source[1])) { | |
268 | s2 = source[1]; | |
269 | } else { | |
270 | s2 = 0; | |
271 | } | |
272 | if(isContinuation(target[1])) { | |
273 | t2 = target[1]; | |
274 | } else { | |
275 | t2 = 0; | |
276 | } | |
277 | ||
278 | uint32_t s = 0, t = 0; | |
279 | if(s1 == t1 && s2 == t2) { | |
280 | return 0; | |
281 | } | |
282 | s = (s1 & 0xFFFF0000)|((s2 & 0xFFFF0000)>>16); | |
283 | t = (t1 & 0xFFFF0000)|((t2 & 0xFFFF0000)>>16); | |
284 | if(s < t) { | |
285 | return -1; | |
286 | } else if(s > t) { | |
287 | return 1; | |
288 | } else { | |
289 | s = (s1 & 0x0000FF00) | (s2 & 0x0000FF00)>>8; | |
290 | t = (t1 & 0x0000FF00) | (t2 & 0x0000FF00)>>8; | |
291 | if(s < t) { | |
292 | return -1; | |
293 | } else if(s > t) { | |
294 | return 1; | |
295 | } else { | |
296 | s = (s1 & 0x000000FF)<<8 | (s2 & 0x000000FF); | |
297 | t = (t1 & 0x000000FF)<<8 | (t2 & 0x000000FF); | |
298 | if(s < t) { | |
299 | return -1; | |
300 | } else { | |
301 | return 1; | |
302 | } | |
303 | } | |
304 | } | |
305 | } | |
306 | ||
b75a7d8f | 307 | static uint32_t addToInverse(UCAElements *element, UErrorCode *status) { |
b75a7d8f A |
308 | uint32_t position = inversePos; |
309 | uint32_t saveElement = element->CEs[0]; | |
374ca955 | 310 | int32_t compResult = 0; |
b75a7d8f A |
311 | element->CEs[0] &= 0xFFFFFF3F; |
312 | if(element->noOfCEs == 1) { | |
313 | element->CEs[1] = 0; | |
314 | } | |
315 | if(inversePos == 0) { | |
316 | inverseTable[0][0] = inverseTable[0][1] = inverseTable[0][2] = 0; | |
317 | addNewInverse(element, status); | |
374ca955 A |
318 | } else if(compareCEs(inverseTable[inversePos], element->CEs) > 0) { |
319 | while((compResult = compareCEs(inverseTable[--position], element->CEs)) > 0); | |
320 | if(VERBOSE) { fprintf(stdout, "p:%u ", (int)position); } | |
321 | if(compResult == 0) { | |
322 | addToExistingInverse(element, position, status); | |
323 | } else { | |
b75a7d8f A |
324 | insertInverse(element, position+1, status); |
325 | } | |
374ca955 A |
326 | } else if(compareCEs(inverseTable[inversePos], element->CEs) == 0) { |
327 | addToExistingInverse(element, inversePos, status); | |
328 | } else { | |
b75a7d8f A |
329 | addNewInverse(element, status); |
330 | } | |
331 | element->CEs[0] = saveElement; | |
332 | if(VERBOSE) { fprintf(stdout, "+"); } | |
333 | return inversePos; | |
334 | } | |
335 | ||
336 | static InverseUCATableHeader *assembleInverseTable(UErrorCode *status) | |
337 | { | |
338 | InverseUCATableHeader *result = NULL; | |
339 | uint32_t headerByteSize = paddedsize(sizeof(InverseUCATableHeader)); | |
340 | uint32_t inverseTableByteSize = (inversePos+2)*sizeof(uint32_t)*3; | |
341 | uint32_t contsByteSize = sContPos * sizeof(UChar); | |
342 | uint32_t i = 0; | |
343 | ||
344 | result = (InverseUCATableHeader *)uprv_malloc(headerByteSize + inverseTableByteSize + contsByteSize); | |
374ca955 | 345 | uprv_memset(result, 0, headerByteSize + inverseTableByteSize + contsByteSize); |
b75a7d8f A |
346 | if(result != NULL) { |
347 | result->byteSize = headerByteSize + inverseTableByteSize + contsByteSize; | |
348 | ||
349 | inversePos++; | |
350 | inverseTable[inversePos][0] = 0xFFFFFFFF; | |
351 | inverseTable[inversePos][1] = 0xFFFFFFFF; | |
352 | inverseTable[inversePos][2] = 0x0000FFFF; | |
353 | inversePos++; | |
354 | ||
355 | for(i = 2; i<inversePos; i++) { | |
374ca955 A |
356 | if(compareCEs(inverseTable[i-1], inverseTable[i]) > 0) { |
357 | fprintf(stderr, "Error at %i: %08X & %08X\n", (int)i, (int)inverseTable[i-1][0], (int)inverseTable[i][0]); | |
b75a7d8f | 358 | } else if(inverseTable[i-1][0] == inverseTable[i][0] && !(inverseTable[i-1][1] < inverseTable[i][1])) { |
374ca955 | 359 | fprintf(stderr, "Continuation error at %i: %08X %08X & %08X %08X\n", (int)i, (int)inverseTable[i-1][0], (int)inverseTable[i-1][1], (int)inverseTable[i][0], (int)inverseTable[i][1]); |
b75a7d8f A |
360 | } |
361 | } | |
362 | ||
363 | result->tableSize = inversePos; | |
364 | result->contsSize = sContPos; | |
365 | ||
366 | result->table = headerByteSize; | |
367 | result->conts = headerByteSize + inverseTableByteSize; | |
368 | ||
369 | memcpy((uint8_t *)result + result->table, inverseTable, inverseTableByteSize); | |
370 | memcpy((uint8_t *)result + result->conts, stringContinue, contsByteSize); | |
371 | ||
372 | } else { | |
373 | *status = U_MEMORY_ALLOCATION_ERROR; | |
374 | return NULL; | |
375 | } | |
376 | ||
377 | return result; | |
378 | } | |
379 | ||
380 | ||
381 | static void writeOutInverseData(InverseUCATableHeader *data, | |
382 | const char *outputDir, | |
383 | const char *copyright, | |
384 | UErrorCode *status) | |
385 | { | |
386 | UNewDataMemory *pData; | |
387 | ||
388 | long dataLength; | |
389 | ||
390 | UDataInfo invUcaInfo; | |
391 | uprv_memcpy(&invUcaInfo, &invUcaDataInfo, sizeof(UDataInfo)); | |
392 | u_getUnicodeVersion(invUcaInfo.dataVersion); | |
393 | ||
374ca955 | 394 | pData=udata_create(outputDir, INVC_DATA_TYPE, INVC_DATA_NAME, &invUcaInfo, |
b75a7d8f A |
395 | copyright, status); |
396 | ||
397 | if(U_FAILURE(*status)) { | |
374ca955 | 398 | fprintf(stderr, "Error: unable to create %s"INVC_DATA_NAME", error %s\n", outputDir, u_errorName(*status)); |
b75a7d8f A |
399 | return; |
400 | } | |
401 | ||
402 | /* write the data to the file */ | |
403 | if (VERBOSE) { | |
404 | fprintf(stdout, "Writing out inverse UCA table: %s%c%s.%s\n", outputDir, U_FILE_SEP_CHAR, | |
374ca955 | 405 | INVC_DATA_NAME, |
b75a7d8f A |
406 | INVC_DATA_TYPE); |
407 | } | |
408 | udata_writeBlock(pData, data, data->byteSize); | |
409 | ||
410 | /* finish up */ | |
411 | dataLength=udata_finish(pData, status); | |
412 | if(U_FAILURE(*status)) { | |
413 | fprintf(stderr, "Error: error %d writing the output file\n", *status); | |
414 | return; | |
415 | } | |
416 | } | |
417 | ||
418 | ||
419 | ||
/*
 * Value of one hexadecimal digit character ('0'-'9', 'a'-'f', 'A'-'F').
 * Any other character yields 0.
 */
static int32_t hex2num(char hex) {
    if('0' <= hex && hex <= '9') {
        return hex - '0';
    }
    if('a' <= hex && hex <= 'f') {
        return 10 + (hex - 'a');
    }
    if('A' <= hex && hex <= 'F') {
        return 10 + (hex - 'A');
    }
    return 0;
}
431 | ||
432 | UCAElements *readAnElement(FILE *data, tempUCATable *t, UCAConstants *consts, UErrorCode *status) { | |
433 | char buffer[2048], primary[100], secondary[100], tertiary[100]; | |
434 | UBool detectedContraction; | |
435 | int32_t i = 0; | |
436 | unsigned int theValue; | |
437 | char *pointer = NULL; | |
438 | char *commentStart = NULL; | |
439 | char *startCodePoint = NULL; | |
440 | char *endCodePoint = NULL; | |
441 | char *spacePointer = NULL; | |
46f4442e | 442 | char *dashPointer = NULL; |
b75a7d8f | 443 | char *result = fgets(buffer, 2048, data); |
374ca955 | 444 | int32_t buflen = (int32_t)uprv_strlen(buffer); |
b75a7d8f A |
445 | if(U_FAILURE(*status)) { |
446 | return 0; | |
447 | } | |
448 | *primary = *secondary = *tertiary = '\0'; | |
449 | if(result == NULL) { | |
450 | if(feof(data)) { | |
451 | return NULL; | |
452 | } else { | |
453 | fprintf(stderr, "empty line but no EOF!\n"); | |
454 | *status = U_INVALID_FORMAT_ERROR; | |
455 | return NULL; | |
456 | } | |
457 | } | |
458 | while(buflen>0 && (buffer[buflen-1] == '\r' || buffer[buflen-1] == '\n')) { | |
459 | buffer[--buflen] = 0; | |
460 | } | |
461 | ||
462 | if(buffer[0] == 0 || buffer[0] == '#') { | |
463 | return NULL; // just a comment, skip whole line | |
464 | } | |
465 | ||
466 | UCAElements *element = ≤ //(UCAElements *)malloc(sizeof(UCAElements)); | |
467 | ||
468 | enum ActionType { | |
469 | READCE, | |
470 | READHEX, | |
471 | READUCAVERSION | |
472 | }; | |
473 | ||
474 | // Directives. | |
475 | if(buffer[0] == '[') { | |
476 | uint32_t cnt = 0; | |
46f4442e A |
477 | static const struct { |
478 | char name[128]; | |
b75a7d8f A |
479 | uint32_t *what; |
480 | ActionType what_to_do; | |
481 | } vt[] = { {"[first tertiary ignorable", consts->UCA_FIRST_TERTIARY_IGNORABLE, READCE}, | |
482 | {"[last tertiary ignorable", consts->UCA_LAST_TERTIARY_IGNORABLE, READCE}, | |
483 | {"[first secondary ignorable", consts->UCA_FIRST_SECONDARY_IGNORABLE, READCE}, | |
484 | {"[last secondary ignorable", consts->UCA_LAST_SECONDARY_IGNORABLE, READCE}, | |
485 | {"[first primary ignorable", consts->UCA_FIRST_PRIMARY_IGNORABLE, READCE}, | |
486 | {"[last primary ignorable", consts->UCA_LAST_PRIMARY_IGNORABLE, READCE}, | |
487 | {"[first variable", consts->UCA_FIRST_VARIABLE, READCE}, | |
488 | {"[last variable", consts->UCA_LAST_VARIABLE, READCE}, | |
489 | {"[first regular", consts->UCA_FIRST_NON_VARIABLE, READCE}, | |
490 | {"[last regular", consts->UCA_LAST_NON_VARIABLE, READCE}, | |
491 | {"[first implicit", consts->UCA_FIRST_IMPLICIT, READCE}, | |
492 | {"[last implicit", consts->UCA_LAST_IMPLICIT, READCE}, | |
493 | {"[first trailing", consts->UCA_FIRST_TRAILING, READCE}, | |
494 | {"[last trailing", consts->UCA_LAST_TRAILING, READCE}, | |
495 | ||
496 | {"[fixed top", &consts->UCA_PRIMARY_TOP_MIN, READHEX}, | |
497 | {"[fixed first implicit byte", &consts->UCA_PRIMARY_IMPLICIT_MIN, READHEX}, | |
498 | {"[fixed last implicit byte", &consts->UCA_PRIMARY_IMPLICIT_MAX, READHEX}, | |
499 | {"[fixed first trail byte", &consts->UCA_PRIMARY_TRAILING_MIN, READHEX}, | |
500 | {"[fixed last trail byte", &consts->UCA_PRIMARY_TRAILING_MAX, READHEX}, | |
501 | {"[fixed first special byte", &consts->UCA_PRIMARY_SPECIAL_MIN, READHEX}, | |
502 | {"[fixed last special byte", &consts->UCA_PRIMARY_SPECIAL_MAX, READHEX}, | |
503 | {"[variable top = ", &t->options->variableTopValue, READHEX}, | |
504 | {"[UCA version = ", NULL, READUCAVERSION} | |
505 | }; | |
506 | for (cnt = 0; cnt<sizeof(vt)/sizeof(vt[0]); cnt++) { | |
507 | uint32_t vtLen = (uint32_t)uprv_strlen(vt[cnt].name); | |
508 | if(uprv_strncmp(buffer, vt[cnt].name, vtLen) == 0) { | |
509 | element->variableTop = TRUE; | |
510 | if(vt[cnt].what_to_do == READHEX) { | |
511 | if(sscanf(buffer+vtLen, "%4x", &theValue) != 1) /* read first code point */ | |
512 | { | |
513 | fprintf(stderr, " scanf(hex) failed on !\n "); | |
514 | } | |
515 | *(vt[cnt].what) = (UChar)theValue; | |
516 | //if(cnt == 1) { // first implicit | |
517 | // we need to set the value for top next | |
518 | //uint32_t nextTop = ucol_prv_calculateImplicitPrimary(0x4E00); // CJK base | |
519 | //consts->UCA_NEXT_TOP_VALUE = theValue<<24 | 0x030303; | |
520 | //} | |
521 | } else if (vt[cnt].what_to_do == READCE) { /* vt[cnt].what_to_do == READCE */ | |
522 | pointer = strchr(buffer+vtLen, '['); | |
523 | if(pointer) { | |
524 | pointer++; | |
525 | element->sizePrim[0]=readElement(&pointer, primary, ',', status); | |
526 | element->sizeSec[0]=readElement(&pointer, secondary, ',', status); | |
527 | element->sizeTer[0]=readElement(&pointer, tertiary, ']', status); | |
528 | ||
529 | vt[cnt].what[0] = getSingleCEValue(primary, secondary, tertiary, status); | |
530 | if(element->sizePrim[0] > 2 || element->sizeSec[0] > 1 || element->sizeTer[0] > 1) { | |
531 | uint32_t CEi = 1; | |
532 | uint32_t value = UCOL_CONTINUATION_MARKER; /* Continuation marker */ | |
533 | if(2*CEi<element->sizePrim[i]) { | |
534 | value |= ((hex2num(*(primary+4*CEi))&0xF)<<28); | |
535 | value |= ((hex2num(*(primary+4*CEi+1))&0xF)<<24); | |
536 | } | |
537 | ||
538 | if(2*CEi+1<element->sizePrim[i]) { | |
539 | value |= ((hex2num(*(primary+4*CEi+2))&0xF)<<20); | |
540 | value |= ((hex2num(*(primary+4*CEi+3))&0xF)<<16); | |
541 | } | |
542 | ||
543 | if(CEi<element->sizeSec[i]) { | |
544 | value |= ((hex2num(*(secondary+2*CEi))&0xF)<<12); | |
545 | value |= ((hex2num(*(secondary+2*CEi+1))&0xF)<<8); | |
546 | } | |
547 | ||
548 | if(CEi<element->sizeTer[i]) { | |
549 | value |= ((hex2num(*(tertiary+2*CEi))&0x3)<<4); | |
550 | value |= (hex2num(*(tertiary+2*CEi+1))&0xF); | |
551 | } | |
552 | ||
553 | CEi++; | |
554 | ||
555 | vt[cnt].what[1] = value; | |
556 | //element->CEs[CEindex++] = value; | |
557 | } else { | |
558 | vt[cnt].what[1] = 0; | |
559 | } | |
560 | } else { | |
561 | fprintf(stderr, "Failed to read a CE from line %s\n", buffer); | |
562 | } | |
563 | } else { //vt[cnt].what_to_do == READUCAVERSION | |
564 | u_versionFromString(UCAVersion, buffer+vtLen); | |
565 | if(VERBOSE) { | |
566 | fprintf(stdout, "UCA version [%hu.%hu.%hu.%hu]\n", UCAVersion[0], UCAVersion[1], UCAVersion[2], UCAVersion[3]); | |
567 | } | |
568 | } | |
569 | //element->cPoints[0] = (UChar)theValue; | |
570 | //return element; | |
571 | return NULL; | |
572 | } | |
573 | } | |
574 | fprintf(stderr, "Warning: unrecognized option: %s\n", buffer); | |
575 | //*status = U_INVALID_FORMAT_ERROR; | |
576 | return NULL; | |
577 | } | |
578 | element->variableTop = FALSE; | |
579 | ||
580 | startCodePoint = buffer; | |
581 | endCodePoint = strchr(startCodePoint, ';'); | |
582 | ||
583 | if(endCodePoint == 0) { | |
584 | fprintf(stderr, "error - line with no code point!\n"); | |
585 | *status = U_INVALID_FORMAT_ERROR; /* No code point - could be an error, but probably only an empty line */ | |
586 | return NULL; | |
587 | } else { | |
588 | *(endCodePoint) = 0; | |
589 | } | |
590 | ||
46f4442e | 591 | memset(element, 0, sizeof(*element)); |
b75a7d8f A |
592 | |
593 | element->cPoints = element->uchars; | |
594 | ||
595 | spacePointer = strchr(buffer, ' '); | |
596 | if(sscanf(buffer, "%4x", &theValue) != 1) /* read first code point */ | |
597 | { | |
598 | fprintf(stderr, " scanf(hex) failed!\n "); | |
599 | } | |
600 | element->cPoints[0] = (UChar)theValue; | |
601 | ||
602 | if(spacePointer == 0) { | |
603 | detectedContraction = FALSE; | |
604 | element->cSize = 1; | |
605 | } else { | |
46f4442e A |
606 | dashPointer = strchr(buffer, '|'); |
607 | if (dashPointer != NULL) { | |
608 | // prefix characters | |
609 | element->prefixChars[0] = (UChar)theValue; | |
610 | element->prefixSize = 1; | |
611 | element->prefix = element->prefixChars; | |
612 | sscanf(dashPointer+1, "%4x", &theValue); | |
613 | element->cPoints[0] = (UChar)theValue; | |
614 | element->cSize = 1; | |
615 | } | |
616 | else { | |
617 | // Contractions or surrogate characters. | |
618 | i = 1; | |
619 | detectedContraction = TRUE; | |
620 | while(spacePointer != NULL) { | |
621 | sscanf(spacePointer+1, "%4x", &theValue); | |
622 | element->cPoints[i++] = (UChar)theValue; | |
623 | spacePointer = strchr(spacePointer+1, ' '); | |
624 | } | |
625 | element->cSize = i; | |
b75a7d8f A |
626 | } |
627 | ||
b75a7d8f A |
628 | |
629 | //fprintf(stderr, "Number of codepoints in contraction: %i\n", i); | |
630 | } | |
631 | ||
632 | startCodePoint = endCodePoint+1; | |
633 | ||
634 | commentStart = strchr(startCodePoint, '#'); | |
635 | if(commentStart == NULL) { | |
636 | commentStart = strlen(startCodePoint) + startCodePoint; | |
637 | } | |
638 | ||
639 | i = 0; | |
640 | uint32_t CEindex = 0; | |
641 | element->noOfCEs = 0; | |
642 | for(;;) { | |
643 | endCodePoint = strchr(startCodePoint, ']'); | |
644 | if(endCodePoint == NULL || endCodePoint >= commentStart) { | |
645 | break; | |
646 | } | |
647 | pointer = strchr(startCodePoint, '['); | |
648 | pointer++; | |
649 | ||
650 | element->sizePrim[i]=readElement(&pointer, primary, ',', status); | |
651 | element->sizeSec[i]=readElement(&pointer, secondary, ',', status); | |
652 | element->sizeTer[i]=readElement(&pointer, tertiary, ']', status); | |
653 | ||
654 | ||
655 | /* I want to get the CEs entered right here, including continuation */ | |
656 | element->CEs[CEindex++] = getSingleCEValue(primary, secondary, tertiary, status); | |
657 | ||
658 | uint32_t CEi = 1; | |
659 | while(2*CEi<element->sizePrim[i] || CEi<element->sizeSec[i] || CEi<element->sizeTer[i]) { | |
660 | uint32_t value = UCOL_CONTINUATION_MARKER; /* Continuation marker */ | |
661 | if(2*CEi<element->sizePrim[i]) { | |
662 | value |= ((hex2num(*(primary+4*CEi))&0xF)<<28); | |
663 | value |= ((hex2num(*(primary+4*CEi+1))&0xF)<<24); | |
664 | } | |
665 | ||
666 | if(2*CEi+1<element->sizePrim[i]) { | |
667 | value |= ((hex2num(*(primary+4*CEi+2))&0xF)<<20); | |
668 | value |= ((hex2num(*(primary+4*CEi+3))&0xF)<<16); | |
669 | } | |
670 | ||
671 | if(CEi<element->sizeSec[i]) { | |
672 | value |= ((hex2num(*(secondary+2*CEi))&0xF)<<12); | |
673 | value |= ((hex2num(*(secondary+2*CEi+1))&0xF)<<8); | |
674 | } | |
675 | ||
676 | if(CEi<element->sizeTer[i]) { | |
677 | value |= ((hex2num(*(tertiary+2*CEi))&0x3)<<4); | |
678 | value |= (hex2num(*(tertiary+2*CEi+1))&0xF); | |
679 | } | |
680 | ||
681 | CEi++; | |
682 | ||
683 | element->CEs[CEindex++] = value; | |
684 | } | |
685 | ||
686 | startCodePoint = endCodePoint+1; | |
687 | i++; | |
688 | } | |
689 | element->noOfCEs = CEindex; | |
73c04bcf | 690 | #if 0 |
b75a7d8f | 691 | element->isThai = UCOL_ISTHAIPREVOWEL(element->cPoints[0]); |
73c04bcf | 692 | #endif |
b75a7d8f | 693 | // we don't want any strange stuff after useful data! |
46f4442e A |
694 | if (pointer == NULL) { |
695 | /* huh? Did we get ']' without the '['? Pair your brackets! */ | |
696 | *status=U_INVALID_FORMAT_ERROR; | |
697 | } | |
698 | else { | |
699 | while(pointer < commentStart) { | |
700 | if(*pointer != ' ' && *pointer != '\t') | |
701 | { | |
702 | *status=U_INVALID_FORMAT_ERROR; | |
703 | break; | |
704 | } | |
705 | pointer++; | |
b75a7d8f | 706 | } |
b75a7d8f A |
707 | } |
708 | ||
709 | if(U_FAILURE(*status)) { | |
374ca955 | 710 | fprintf(stderr, "problem putting stuff in hash table %s\n", u_errorName(*status)); |
b75a7d8f A |
711 | *status = U_INTERNAL_PROGRAM_ERROR; |
712 | return NULL; | |
713 | } | |
714 | ||
715 | return element; | |
716 | } | |
717 | ||
718 | ||
719 | void writeOutData(UCATableHeader *data, | |
720 | UCAConstants *consts, | |
721 | UChar contractions[][3], | |
722 | uint32_t noOfcontractions, | |
723 | const char *outputDir, | |
724 | const char *copyright, | |
725 | UErrorCode *status) | |
726 | { | |
727 | if(U_FAILURE(*status)) { | |
728 | return; | |
729 | } | |
730 | ||
731 | uint32_t size = data->size; | |
732 | ||
374ca955 A |
733 | data->UCAConsts = data->size; |
734 | data->size += paddedsize(sizeof(UCAConstants)); | |
735 | ||
b75a7d8f A |
736 | if(noOfcontractions != 0) { |
737 | contractions[noOfcontractions][0] = 0; | |
738 | contractions[noOfcontractions][1] = 0; | |
739 | contractions[noOfcontractions][2] = 0; | |
740 | noOfcontractions++; | |
741 | ||
742 | ||
b75a7d8f | 743 | data->contractionUCACombos = data->size; |
374ca955 A |
744 | data->contractionUCACombosWidth = 3; |
745 | data->contractionUCACombosSize = noOfcontractions; | |
b75a7d8f A |
746 | data->size += paddedsize((noOfcontractions*3*sizeof(UChar))); |
747 | } | |
748 | ||
749 | UNewDataMemory *pData; | |
750 | ||
751 | long dataLength; | |
752 | UDataInfo ucaInfo; | |
753 | uprv_memcpy(&ucaInfo, &ucaDataInfo, sizeof(UDataInfo)); | |
754 | u_getUnicodeVersion(ucaInfo.dataVersion); | |
755 | ||
374ca955 | 756 | pData=udata_create(outputDir, UCA_DATA_TYPE, UCA_DATA_NAME, &ucaInfo, |
b75a7d8f A |
757 | copyright, status); |
758 | ||
759 | if(U_FAILURE(*status)) { | |
374ca955 | 760 | fprintf(stderr, "Error: unable to create %s"UCA_DATA_NAME", error %s\n", outputDir, u_errorName(*status)); |
b75a7d8f A |
761 | return; |
762 | } | |
763 | ||
764 | /* write the data to the file */ | |
765 | if (VERBOSE) { | |
766 | fprintf(stdout, "Writing out UCA table: %s%c%s.%s\n", outputDir, | |
767 | U_FILE_SEP_CHAR, | |
768 | U_ICUDATA_NAME "_" UCA_DATA_NAME, | |
769 | UCA_DATA_TYPE); | |
770 | } | |
771 | udata_writeBlock(pData, data, size); | |
772 | ||
773 | // output the constants here | |
774 | udata_writeBlock(pData, consts, sizeof(UCAConstants)); | |
775 | ||
776 | if(noOfcontractions != 0) { | |
777 | udata_writeBlock(pData, contractions, noOfcontractions*3*sizeof(UChar)); | |
778 | udata_writePadding(pData, paddedsize((noOfcontractions*3*sizeof(UChar))) - noOfcontractions*3*sizeof(uint16_t)); | |
779 | } | |
780 | ||
781 | /* finish up */ | |
782 | dataLength=udata_finish(pData, status); | |
783 | if(U_FAILURE(*status)) { | |
784 | fprintf(stderr, "Error: error %d writing the output file\n", *status); | |
785 | return; | |
786 | } | |
787 | } | |
788 | ||
789 | static int32_t | |
790 | write_uca_table(const char *filename, | |
791 | const char *outputDir, | |
792 | const char *copyright, | |
793 | UErrorCode *status) | |
794 | { | |
795 | FILE *data = fopen(filename, "r"); | |
73c04bcf A |
796 | if(data == NULL) { |
797 | fprintf(stderr, "Couldn't open file: %s\n", filename); | |
798 | return -1; | |
799 | } | |
b75a7d8f A |
800 | uint32_t line = 0; |
801 | UCAElements *element = NULL; | |
802 | UChar variableTopValue = 0; | |
803 | UCATableHeader *myD = (UCATableHeader *)uprv_malloc(sizeof(UCATableHeader)); | |
804 | /* test for NULL */ | |
374ca955 A |
805 | if(myD == NULL) { |
806 | *status = U_MEMORY_ALLOCATION_ERROR; | |
807 | fclose(data); | |
808 | return 0; | |
809 | } | |
810 | uprv_memset(myD, 0, sizeof(UCATableHeader)); | |
b75a7d8f A |
811 | UColOptionSet *opts = (UColOptionSet *)uprv_malloc(sizeof(UColOptionSet)); |
812 | /* test for NULL */ | |
374ca955 A |
813 | if(opts == NULL) { |
814 | *status = U_MEMORY_ALLOCATION_ERROR; | |
815 | uprv_free(myD); | |
816 | fclose(data); | |
817 | return 0; | |
818 | } | |
819 | uprv_memset(opts, 0, sizeof(UColOptionSet)); | |
73c04bcf A |
820 | UChar contractionCEs[512][3]; |
821 | uprv_memset(contractionCEs, 0, 512*3*sizeof(UChar)); | |
b75a7d8f A |
822 | uint32_t noOfContractions = 0; |
823 | UCAConstants consts; | |
374ca955 | 824 | uprv_memset(&consts, 0, sizeof(consts)); |
b75a7d8f A |
825 | #if 0 |
826 | UCAConstants consts = { | |
827 | UCOL_RESET_TOP_VALUE, | |
828 | UCOL_FIRST_PRIMARY_IGNORABLE, | |
829 | UCOL_LAST_PRIMARY_IGNORABLE, | |
830 | UCOL_LAST_PRIMARY_IGNORABLE_CONT, | |
831 | UCOL_FIRST_SECONDARY_IGNORABLE, | |
832 | UCOL_LAST_SECONDARY_IGNORABLE, | |
833 | UCOL_FIRST_TERTIARY_IGNORABLE, | |
834 | UCOL_LAST_TERTIARY_IGNORABLE, | |
835 | UCOL_FIRST_VARIABLE, | |
836 | UCOL_LAST_VARIABLE, | |
837 | UCOL_FIRST_NON_VARIABLE, | |
838 | UCOL_LAST_NON_VARIABLE, | |
839 | ||
840 | UCOL_NEXT_TOP_VALUE, | |
841 | /* | |
842 | UCOL_NEXT_FIRST_PRIMARY_IGNORABLE, | |
843 | UCOL_NEXT_LAST_PRIMARY_IGNORABLE, | |
844 | UCOL_NEXT_FIRST_SECONDARY_IGNORABLE, | |
845 | UCOL_NEXT_LAST_SECONDARY_IGNORABLE, | |
846 | UCOL_NEXT_FIRST_TERTIARY_IGNORABLE, | |
847 | UCOL_NEXT_LAST_TERTIARY_IGNORABLE, | |
848 | UCOL_NEXT_FIRST_VARIABLE, | |
849 | UCOL_NEXT_LAST_VARIABLE, | |
850 | */ | |
851 | ||
852 | PRIMARY_IMPLICIT_MIN, | |
853 | PRIMARY_IMPLICIT_MAX | |
854 | }; | |
855 | #endif | |
856 | ||
857 | ||
374ca955 | 858 | uprv_memset(inverseTable, 0xDA, sizeof(int32_t)*3*0xFFFF); |
b75a7d8f A |
859 | |
860 | opts->variableTopValue = variableTopValue; | |
861 | opts->strength = UCOL_TERTIARY; | |
862 | opts->frenchCollation = UCOL_OFF; | |
863 | opts->alternateHandling = UCOL_NON_IGNORABLE; /* attribute for handling variable elements*/ | |
864 | opts->caseFirst = UCOL_OFF; /* who goes first, lower case or uppercase */ | |
865 | opts->caseLevel = UCOL_OFF; /* do we have an extra case level */ | |
866 | opts->normalizationMode = UCOL_OFF; /* attribute for normalization */ | |
867 | opts->hiraganaQ = UCOL_OFF; /* attribute for JIS X 4061, used only in Japanese */ | |
374ca955 | 868 | opts->numericCollation = UCOL_OFF; |
b75a7d8f A |
869 | myD->jamoSpecial = FALSE; |
870 | ||
374ca955 | 871 | tempUCATable *t = uprv_uca_initTempTable(myD, opts, NULL, IMPLICIT_TAG, LEAD_SURROGATE_TAG, status); |
b75a7d8f A |
872 | if(U_FAILURE(*status)) |
873 | { | |
874 | fprintf(stderr, "Failed to init UCA temp table: %s\n", u_errorName(*status)); | |
73c04bcf A |
875 | uprv_free(opts); |
876 | uprv_free(myD); | |
877 | fclose(data); | |
b75a7d8f A |
878 | return -1; |
879 | } | |
880 | ||
881 | #if 0 | |
882 | IMPLICIT_TAG = 9, | |
883 | /* | |
884 | ***************************************************************************************** | |
885 | * NON_CHARACTER FDD0 - FDEF, FFFE, FFFF, 1FFFE, 1FFFF, 2FFFE, 2FFFF,...e.g. **FFFE, **FFFF | |
886 | ****************************************************************************************** | |
887 | */ | |
888 | #endif | |
889 | ||
890 | // * set to zero | |
891 | struct { | |
892 | UChar32 start; | |
893 | UChar32 end; | |
894 | int32_t value; | |
895 | } ranges[] = | |
896 | { | |
897 | #if 0 | |
898 | {0xAC00, 0xD7AF, UCOL_SPECIAL_FLAG | (HANGUL_SYLLABLE_TAG << 24) }, //0 HANGUL_SYLLABLE_TAG,/* AC00-D7AF*/ | |
899 | {0xD800, 0xDBFF, UCOL_SPECIAL_FLAG | (LEAD_SURROGATE_TAG << 24) }, //1 LEAD_SURROGATE_TAG, /* D800-DBFF*/ | |
900 | {0xDC00, 0xDFFF, UCOL_SPECIAL_FLAG | (TRAIL_SURROGATE_TAG << 24) }, //2 TRAIL_SURROGATE DC00-DFFF | |
901 | {0x3400, 0x4DB5, UCOL_SPECIAL_FLAG | (CJK_IMPLICIT_TAG << 24) }, //3 CJK_IMPLICIT_TAG, /* 0x3400-0x4DB5*/ | |
902 | {0x4E00, 0x9FA5, UCOL_SPECIAL_FLAG | (CJK_IMPLICIT_TAG << 24) }, //4 CJK_IMPLICIT_TAG, /* 0x4E00-0x9FA5*/ | |
903 | {0xF900, 0xFA2D, UCOL_SPECIAL_FLAG | (CJK_IMPLICIT_TAG << 24) }, //5 CJK_IMPLICIT_TAG, /* 0xF900-0xFA2D*/ | |
904 | {0x20000, 0x2A6D6, UCOL_SPECIAL_FLAG | (CJK_IMPLICIT_TAG << 24) }, //6 CJK_IMPLICIT_TAG, /* 0x20000-0x2A6D6*/ | |
905 | {0x2F800, 0x2FA1D, UCOL_SPECIAL_FLAG | (CJK_IMPLICIT_TAG << 24) }, //7 CJK_IMPLICIT_TAG, /* 0x2F800-0x2FA1D*/ | |
906 | #endif | |
907 | {0xAC00, 0xD7B0, UCOL_SPECIAL_FLAG | (HANGUL_SYLLABLE_TAG << 24) }, //0 HANGUL_SYLLABLE_TAG,/* AC00-D7AF*/ | |
374ca955 | 908 | //{0xD800, 0xDC00, UCOL_SPECIAL_FLAG | (LEAD_SURROGATE_TAG << 24) }, //1 LEAD_SURROGATE_TAG, /* D800-DBFF*/ |
b75a7d8f A |
909 | {0xDC00, 0xE000, UCOL_SPECIAL_FLAG | (TRAIL_SURROGATE_TAG << 24) }, //2 TRAIL_SURROGATE DC00-DFFF |
910 | // Now directly handled in the collation code by the swapCJK function. | |
911 | //{0x3400, 0x4DB6, UCOL_SPECIAL_FLAG | (CJK_IMPLICIT_TAG << 24) }, //3 CJK_IMPLICIT_TAG, /* 0x3400-0x4DB5*/ | |
912 | //{0x4E00, 0x9FA6, UCOL_SPECIAL_FLAG | (CJK_IMPLICIT_TAG << 24) }, //4 CJK_IMPLICIT_TAG, /* 0x4E00-0x9FA5*/ | |
913 | //{0xF900, 0xFA2E, UCOL_SPECIAL_FLAG | (CJK_IMPLICIT_TAG << 24) }, //5 CJK_IMPLICIT_TAG, /* 0xF900-0xFA2D*/ | |
914 | //{0x20000, 0x2A6D7, UCOL_SPECIAL_FLAG | (CJK_IMPLICIT_TAG << 24) }, //6 CJK_IMPLICIT_TAG, /* 0x20000-0x2A6D6*/ | |
915 | //{0x2F800, 0x2FA1E, UCOL_SPECIAL_FLAG | (CJK_IMPLICIT_TAG << 24) }, //7 CJK_IMPLICIT_TAG, /* 0x2F800-0x2FA1D*/ | |
916 | }; | |
917 | uint32_t i = 0; | |
918 | ||
919 | for(i = 0; i<sizeof(ranges)/sizeof(ranges[0]); i++) { | |
920 | /*ucmpe32_setRange32(t->mapping, ranges[i].start, ranges[i].end, ranges[i].value); */ | |
921 | utrie_setRange32(t->mapping, ranges[i].start, ranges[i].end, ranges[i].value, TRUE); | |
922 | } | |
923 | ||
924 | ||
925 | int32_t surrogateCount = 0; | |
926 | while(!feof(data)) { | |
927 | if(U_FAILURE(*status)) { | |
374ca955 A |
928 | fprintf(stderr, "Something returned an error %i (%s) while processing line %u of %s. Exiting...\n", |
929 | *status, u_errorName(*status), (int)line, filename); | |
b75a7d8f A |
930 | exit(*status); |
931 | } | |
932 | ||
933 | element = readAnElement(data, t, &consts, status); | |
934 | line++; | |
935 | if(VERBOSE) { | |
374ca955 | 936 | fprintf(stdout, "%u ", (int)line); |
b75a7d8f A |
937 | } |
938 | if(element != NULL) { | |
939 | // we have read the line, now do something sensible with the read data! | |
940 | ||
941 | // Below stuff was taken care of in readAnElement | |
942 | //if(element->variableTop == TRUE && variableTopValue == 0) { | |
943 | // t->options->variableTopValue = element->cPoints[0]; | |
944 | //} | |
945 | ||
946 | // if element is a contraction, we want to add it to contractions | |
947 | if(element->cSize > 1 && element->cPoints[0] != 0xFDD0) { // this is a contraction | |
948 | if(UTF_IS_LEAD(element->cPoints[0]) && UTF_IS_TRAIL(element->cPoints[1]) && element->cSize == 2) { | |
949 | surrogateCount++; | |
950 | } else { | |
951 | contractionCEs[noOfContractions][0] = element->cPoints[0]; | |
952 | contractionCEs[noOfContractions][1] = element->cPoints[1]; | |
953 | if(element->cSize > 2) { // the third one | |
954 | contractionCEs[noOfContractions][2] = element->cPoints[2]; | |
955 | } else { | |
956 | contractionCEs[noOfContractions][2] = 0; | |
957 | } | |
958 | noOfContractions++; | |
959 | } | |
960 | } | |
46f4442e A |
961 | else { |
962 | // TODO (claireho): does this work? Need more tests | |
963 | // The following code is to handle the UCA pre-context rules | |
964 | // for L/l with middle dot. We share the structures for contractionCombos. | |
965 | // The format for pre-context character is | |
966 | // contractionCEs[0]: codepoint in element->cPoints[0] | |
967 | // contractionCEs[1]: '\0' to differentiate with contractions. | |
968 | // contractionCEs[2]: prefix char | |
969 | if (element->prefixSize>0) { | |
970 | contractionCEs[noOfContractions][0]=element->cPoints[0]; | |
971 | contractionCEs[noOfContractions][1]='\0'; | |
972 | contractionCEs[noOfContractions][2]=element->prefixChars[0]; | |
973 | noOfContractions++; | |
974 | } | |
975 | ||
976 | } | |
b75a7d8f A |
977 | |
978 | /* we're first adding to inverse, because addAnElement will reverse the order */ | |
979 | /* of code points and stuff... we don't want that to happen */ | |
980 | addToInverse(element, status); | |
981 | if(!(element->cSize > 1 && element->cPoints[0] == 0xFDD0)) { | |
982 | uprv_uca_addAnElement(t, element, status); | |
983 | } | |
984 | } | |
985 | } | |
986 | ||
987 | if(UCAVersion[0] == 0 && UCAVersion[1] == 0 && UCAVersion[2] == 0 && UCAVersion[3] == 0) { | |
73c04bcf A |
988 | fprintf(stderr, "UCA version not specified. Cannot create data file!\n"); |
989 | uprv_uca_closeTempTable(t); | |
990 | uprv_free(opts); | |
991 | uprv_free(myD); | |
992 | fclose(data); | |
993 | return -1; | |
b75a7d8f | 994 | } |
374ca955 A |
995 | /* { |
996 | uint32_t trieWord = utrie_get32(t->mapping, 0xDC01, NULL); | |
997 | }*/ | |
b75a7d8f A |
998 | |
999 | if (VERBOSE) { | |
374ca955 A |
1000 | fprintf(stdout, "\nLines read: %u\n", (int)line); |
1001 | fprintf(stdout, "Surrogate count: %i\n", (int)surrogateCount); | |
b75a7d8f A |
1002 | fprintf(stdout, "Raw data breakdown:\n"); |
1003 | /*fprintf(stdout, "Compact array stage1 top: %i, stage2 top: %i\n", t->mapping->stage1Top, t->mapping->stage2Top);*/ | |
374ca955 A |
1004 | fprintf(stdout, "Number of contractions: %u\n", (int)noOfContractions); |
1005 | fprintf(stdout, "Contraction image size: %u\n", (int)t->image->contractionSize); | |
1006 | fprintf(stdout, "Expansions size: %i\n", (int)t->expansions->position); | |
b75a7d8f A |
1007 | } |
1008 | ||
1009 | ||
1010 | /* produce canonical closure for table */ | |
1011 | /* first set up constants for implicit calculation */ | |
46f4442e | 1012 | uprv_uca_initImplicitConstants(status); |
b75a7d8f | 1013 | /* do the closure */ |
46f4442e | 1014 | int32_t noOfClosures = uprv_uca_canonicalClosure(t, NULL, status); |
b75a7d8f | 1015 | if(noOfClosures != 0) { |
374ca955 | 1016 | fprintf(stderr, "Warning: %i canonical closures occured!\n", (int)noOfClosures); |
b75a7d8f A |
1017 | } |
1018 | ||
1019 | /* test */ | |
1020 | UCATableHeader *myData = uprv_uca_assembleTable(t, status); | |
1021 | ||
1022 | if (VERBOSE) { | |
1023 | fprintf(stdout, "Compacted data breakdown:\n"); | |
1024 | /*fprintf(stdout, "Compact array stage1 top: %i, stage2 top: %i\n", t->mapping->stage1Top, t->mapping->stage2Top);*/ | |
374ca955 A |
1025 | fprintf(stdout, "Number of contractions: %u\n", (int)noOfContractions); |
1026 | fprintf(stdout, "Contraction image size: %u\n", (int)t->image->contractionSize); | |
1027 | fprintf(stdout, "Expansions size: %i\n", (int)t->expansions->position); | |
1028 | } | |
1029 | ||
1030 | if(U_FAILURE(*status)) { | |
1031 | fprintf(stderr, "Error creating table: %s\n", u_errorName(*status)); | |
73c04bcf A |
1032 | uprv_uca_closeTempTable(t); |
1033 | uprv_free(opts); | |
1034 | uprv_free(myD); | |
1035 | fclose(data); | |
374ca955 | 1036 | return -1; |
b75a7d8f A |
1037 | } |
1038 | ||
1039 | /* populate the version info struct with version info*/ | |
1040 | myData->version[0] = UCOL_BUILDER_VERSION; | |
1041 | myData->version[1] = UCAVersion[0]; | |
1042 | myData->version[2] = UCAVersion[1]; | |
1043 | myData->version[3] = UCAVersion[2]; | |
1044 | /*TODO:The fractional rules version should be taken from FractionalUCA.txt*/ | |
1045 | // Removed this macro. Instead, we use the fields below | |
1046 | //myD->version[1] = UCOL_FRACTIONAL_UCA_VERSION; | |
1047 | //myD->UCAVersion = UCAVersion; // out of FractionalUCA.txt | |
1048 | uprv_memcpy(myData->UCAVersion, UCAVersion, sizeof(UVersionInfo)); | |
1049 | u_getUnicodeVersion(myData->UCDVersion); | |
1050 | ||
1051 | writeOutData(myData, &consts, contractionCEs, noOfContractions, outputDir, copyright, status); | |
1052 | ||
1053 | InverseUCATableHeader *inverse = assembleInverseTable(status); | |
1054 | uprv_memcpy(inverse->UCAVersion, UCAVersion, sizeof(UVersionInfo)); | |
1055 | writeOutInverseData(inverse, outputDir, copyright, status); | |
1056 | ||
73c04bcf | 1057 | uprv_uca_closeTempTable(t); |
b75a7d8f A |
1058 | uprv_free(myD); |
1059 | uprv_free(opts); | |
1060 | ||
1061 | ||
1062 | uprv_free(myData); | |
1063 | uprv_free(inverse); | |
1064 | fclose(data); | |
1065 | ||
1066 | return 0; | |
1067 | } | |
1068 | ||
1069 | #endif /* #if !UCONFIG_NO_COLLATION */ | |
1070 | ||
1071 | static UOption options[]={ | |
1072 | UOPTION_HELP_H, /* 0 Numbers for those who*/ | |
1073 | UOPTION_HELP_QUESTION_MARK, /* 1 can't count. */ | |
1074 | UOPTION_COPYRIGHT, /* 2 */ | |
1075 | UOPTION_VERSION, /* 3 */ | |
1076 | UOPTION_DESTDIR, /* 4 */ | |
1077 | UOPTION_SOURCEDIR, /* 5 */ | |
1078 | UOPTION_VERBOSE, /* 6 */ | |
1079 | UOPTION_ICUDATADIR /* 7 */ | |
1080 | /* weiv can't count :))))) */ | |
1081 | }; | |
1082 | ||
1083 | int main(int argc, char* argv[]) { | |
1084 | UErrorCode status = U_ZERO_ERROR; | |
1085 | const char* destdir = NULL; | |
1086 | const char* srcDir = NULL; | |
1087 | char filename[300]; | |
1088 | char *basename = NULL; | |
1089 | const char *copyright = NULL; | |
1090 | uprv_memset(&UCAVersion, 0, 4); | |
1091 | ||
1092 | U_MAIN_INIT_ARGS(argc, argv); | |
1093 | ||
1094 | /* preset then read command line options */ | |
1095 | options[4].value=u_getDataDirectory(); | |
1096 | options[5].value=""; | |
1097 | argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options); | |
1098 | ||
1099 | /* error handling, printing usage message */ | |
1100 | if(argc<0) { | |
1101 | fprintf(stderr, | |
1102 | "error in command line argument \"%s\"\n", | |
1103 | argv[-argc]); | |
1104 | } else if(argc<2) { | |
1105 | argc=-1; | |
1106 | } | |
1107 | if(options[0].doesOccur || options[1].doesOccur) { | |
1108 | fprintf(stderr, | |
1109 | "usage: %s [-options] file\n" | |
1110 | "\tRead in UCA collation text data and write out the binary collation data\n" | |
1111 | "options:\n" | |
1112 | "\t-h or -? or --help this usage text\n" | |
1113 | "\t-V or --version show a version message\n" | |
1114 | "\t-c or --copyright include a copyright notice\n" | |
1115 | "\t-d or --destdir destination directory, followed by the path\n" | |
1116 | "\t-s or --sourcedir source directory, followed by the path\n" | |
1117 | "\t-v or --verbose turn on verbose output\n" | |
1118 | "\t-i or --icudatadir directory for locating any needed intermediate data files,\n" | |
1119 | "\t followed by path, defaults to %s\n", | |
1120 | argv[0], u_getDataDirectory()); | |
1121 | return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; | |
1122 | } | |
b75a7d8f | 1123 | if(options[3].doesOccur) { |
374ca955 | 1124 | fprintf(stdout, "genuca version %hu.%hu, ICU tool to read UCA text data and create UCA data tables for collation.\n", |
b75a7d8f A |
1125 | #if UCONFIG_NO_COLLATION |
1126 | 0, 0 | |
1127 | #else | |
374ca955 | 1128 | UCA_FORMAT_VERSION_0, UCA_FORMAT_VERSION_1 |
b75a7d8f A |
1129 | #endif |
1130 | ); | |
374ca955 | 1131 | fprintf(stdout, U_COPYRIGHT_STRING"\n"); |
b75a7d8f A |
1132 | exit(0); |
1133 | } | |
1134 | ||
1135 | /* get the options values */ | |
1136 | destdir = options[4].value; | |
1137 | srcDir = options[5].value; | |
1138 | VERBOSE = options[6].doesOccur; | |
1139 | ||
1140 | if (options[2].doesOccur) { | |
1141 | copyright = U_COPYRIGHT_STRING; | |
1142 | } | |
1143 | ||
1144 | if (options[7].doesOccur) { | |
1145 | u_setDataDirectory(options[7].value); | |
1146 | } | |
374ca955 A |
1147 | /* Initialize ICU */ |
1148 | u_init(&status); | |
1149 | if (U_FAILURE(status) && status != U_FILE_ACCESS_ERROR) { | |
1150 | fprintf(stderr, "%s: can not initialize ICU. status = %s\n", | |
1151 | argv[0], u_errorName(status)); | |
1152 | exit(1); | |
1153 | } | |
1154 | status = U_ZERO_ERROR; | |
1155 | ||
b75a7d8f A |
1156 | |
1157 | /* prepare the filename beginning with the source dir */ | |
1158 | uprv_strcpy(filename, srcDir); | |
1159 | basename=filename+uprv_strlen(filename); | |
1160 | ||
1161 | if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) { | |
1162 | *basename++ = U_FILE_SEP_CHAR; | |
1163 | } | |
1164 | ||
1165 | if(argc < 0) { | |
1166 | uprv_strcpy(basename, "FractionalUCA.txt"); | |
1167 | } else { | |
1168 | argv++; | |
1169 | uprv_strcpy(basename, getLongPathname(*argv)); | |
1170 | } | |
1171 | ||
1172 | #if 0 | |
1173 | if(u_getCombiningClass(0x0053) == 0) | |
1174 | { | |
1175 | fprintf(stderr, "SEVERE ERROR: Normalization data is not functioning! Bailing out. Was not able to load unorm.dat.\n"); | |
1176 | exit(1); | |
1177 | } | |
1178 | #endif | |
1179 | ||
1180 | #if UCONFIG_NO_COLLATION | |
1181 | ||
1182 | UNewDataMemory *pData; | |
1183 | const char *msg; | |
1184 | ||
374ca955 | 1185 | msg = "genuca writes dummy " UCA_DATA_NAME "." UCA_DATA_TYPE " because of UCONFIG_NO_COLLATION, see uconfig.h"; |
b75a7d8f | 1186 | fprintf(stderr, "%s\n", msg); |
374ca955 | 1187 | pData = udata_create(destdir, UCA_DATA_TYPE, UCA_DATA_NAME, &dummyDataInfo, |
b75a7d8f A |
1188 | NULL, &status); |
1189 | udata_writeBlock(pData, msg, strlen(msg)); | |
1190 | udata_finish(pData, &status); | |
1191 | ||
374ca955 | 1192 | msg = "genuca writes dummy " INVC_DATA_NAME "." INVC_DATA_TYPE " because of UCONFIG_NO_COLLATION, see uconfig.h"; |
b75a7d8f | 1193 | fprintf(stderr, "%s\n", msg); |
374ca955 | 1194 | pData = udata_create(destdir, INVC_DATA_TYPE, INVC_DATA_NAME, &dummyDataInfo, |
b75a7d8f A |
1195 | NULL, &status); |
1196 | udata_writeBlock(pData, msg, strlen(msg)); | |
1197 | udata_finish(pData, &status); | |
1198 | ||
1199 | return (int)status; | |
1200 | ||
1201 | #else | |
1202 | ||
1203 | return write_uca_table(filename, destdir, copyright, &status); | |
1204 | ||
1205 | #endif | |
1206 | } | |
1207 | ||
1208 | /* | |
1209 | * Hey, Emacs, please set the following: | |
1210 | * | |
1211 | * Local Variables: | |
1212 | * indent-tabs-mode: nil | |
1213 | * End: | |
1214 | * | |
1215 | */ |