]>
Commit | Line | Data |
---|---|---|
b75a7d8f A |
1 | /* |
2 | ********************************************************************** | |
374ca955 | 3 | * Copyright (c) 2002-2004, International Business Machines |
b75a7d8f A |
4 | * Corporation and others. All Rights Reserved. |
5 | ********************************************************************** | |
6 | * Author: Alan Liu | |
7 | * Created: October 30 2002 | |
8 | * Since: ICU 2.4 | |
9 | ********************************************************************** | |
10 | */ | |
11 | #ifndef PROPNAME_H | |
12 | #define PROPNAME_H | |
13 | ||
14 | #include "unicode/utypes.h" | |
15 | #include "unicode/uchar.h" | |
374ca955 | 16 | #include "udataswp.h" |
b75a7d8f A |
17 | #include "uprops.h" |
18 | ||
374ca955 A |
19 | /* |
20 | * This header defines the in-memory layout of the property names data | |
21 | * structure representing the UCD data files PropertyAliases.txt and | |
22 | * PropertyValueAliases.txt. It is used by: | |
23 | * propname.cpp - reads data | |
24 | * genpname - creates data | |
25 | */ | |
b75a7d8f | 26 | |
374ca955 A |
27 | /* low-level char * property name comparison -------------------------------- */ |
28 | ||
29 | U_CDECL_BEGIN | |
b75a7d8f | 30 | |
374ca955 A |
31 | /** |
32 | * \var uprv_comparePropertyNames | |
33 | * Unicode property names and property value names are compared "loosely". | |
34 | * | |
35 | * UCD.html 4.0.1 says: | |
36 | * For all property names, property value names, and for property values for | |
37 | * Enumerated, Binary, or Catalog properties, use the following | |
38 | * loose matching rule: | |
39 | * | |
40 | * LM3. Ignore case, whitespace, underscore ('_'), and hyphens. | |
41 | * | |
42 | * This function does just that, for (char *) name strings. | |
43 | * It is almost identical to ucnv_compareNames() but also ignores | |
44 | * C0 White_Space characters (U+0009..U+000d, and U+0085 on EBCDIC). | |
45 | * uprv_comparePropertyNames itself is a macro: it resolves to the ASCII or the EBCDIC function declared below, depending on U_CHARSET_FAMILY. Callers in this header treat the int32_t result as a three-way comparison (<0, 0, >0). | |
46 | * @internal | |
47 | */ | |
b75a7d8f | 48 | |
374ca955 A |
49 | U_CAPI int32_t U_EXPORT2 |
50 | uprv_compareASCIIPropertyNames(const char *name1, const char *name2); | |
51 | /* Variant selected as uprv_comparePropertyNames when U_CHARSET_FAMILY==U_EBCDIC_FAMILY. */ ||
52 | U_CAPI int32_t U_EXPORT2 | |
53 | uprv_compareEBCDICPropertyNames(const char *name1, const char *name2); | |
54 | ||
55 | #if U_CHARSET_FAMILY==U_ASCII_FAMILY | |
56 | # define uprv_comparePropertyNames uprv_compareASCIIPropertyNames | |
57 | #elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY | |
58 | # define uprv_comparePropertyNames uprv_compareEBCDICPropertyNames | |
59 | #else | |
60 | # error U_CHARSET_FAMILY is not valid | |
61 | #endif | |
62 | ||
63 | U_CDECL_END | |
64 | ||
65 | /* UDataMemory structure and signatures ------------------------------------- */ | |
b75a7d8f A |
66 | |
67 | #define PNAME_DATA_NAME "pnames" | |
68 | #define PNAME_DATA_TYPE "icu" | |
69 | ||
374ca955 | 70 | /* Fields in UDataInfo: */ |
b75a7d8f | 71 | |
374ca955 | 72 | /* PNAME_SIG[] is encoded as numeric literals for compatibility with the HP compiler */ |
b75a7d8f A |
73 | #define PNAME_SIG_0 ((uint8_t)0x70) /* p */ |
74 | #define PNAME_SIG_1 ((uint8_t)0x6E) /* n */ | |
75 | #define PNAME_SIG_2 ((uint8_t)0x61) /* a */ | |
76 | #define PNAME_SIG_3 ((uint8_t)0x6D) /* m */ | |
77 | ||
78 | #define PNAME_FORMAT_VERSION ((int8_t)1) /* formatVersion[0] */ | |
79 | ||
374ca955 A |
80 | /** |
81 | * Swap pnames.icu. See udataswp.h. | |
82 | * NOTE(review): presumably reads length bytes at inData, writes the swapped form to outData and reports failure through pErrorCode -- confirm against udataswp.h. | |
82 | * @internal | |
83 | */ | |
84 | U_CAPI int32_t U_EXPORT2 | |
85 | upname_swap(const UDataSwapper *ds, | |
86 | const void *inData, int32_t length, void *outData, | |
87 | UErrorCode *pErrorCode); | |
88 | ||
89 | ||
90 | #ifdef XP_CPLUSPLUS | |
91 | ||
92 | class Builder; | |
93 | ||
94 | U_NAMESPACE_BEGIN | |
95 | ||
b75a7d8f A |
96 | /** |
97 | * An offset from the start of the pnames data to a contained entity. | |
98 | * This must be a signed value, since negative offsets are used as an | |
99 | * end-of-list marker. Offsets to actual objects are non-zero. A | |
100 | * zero offset indicates an absent entry; this corresponds to aliases | |
101 | * marked "n/a" in the original Unicode data files. | |
102 | */ | |
374ca955 | 103 | typedef int16_t Offset; /* must be signed */ |
b75a7d8f A |
104 | |
105 | #define MAX_OFFSET 0x7FFF | |
106 | ||
107 | /** | |
108 | * A generic value for a property or property value. Typically an | |
109 | * enum from uchar.h, but sometimes a non-enum value. It must be | |
110 | * large enough to accommodate the largest enum value, which as of this | |
111 | * writing is the largest general category mask. Need not be signed | |
112 | * but may be. Typically it doesn't matter, since the caller will | |
113 | * cast it to the proper type before use. Takes the special value | |
114 | * UCHAR_INVALID_CODE for invalid input. | |
115 | */ | |
116 | typedef int32_t EnumValue; | |
117 | ||
374ca955 A |
118 | /* ---------------------------------------------------------------------- */ |
119 | /* ValueMap */ | |
b75a7d8f A |
120 | |
121 | /** |
122 | * For any top-level property that has named values (binary and | |
123 | * enumerated properties), there is a ValueMap object. This object | |
124 | * holds the Offsets of two other maps for that property. One goes from value enums | |
125 | * to value names. The other goes from value names to value enums. | |
126 | * | |
127 | * The value enum values may be contiguous or disjoint. If they are | |
128 | * contiguous then the enumToName_offset is nonzero, and the | |
129 | * ncEnumToName_offset is zero. Vice versa if the value enums are | |
130 | * disjoint. (A zero Offset denotes an absent entry.) | |
131 | * | |
132 | * There are n of these objects, where n is the number of binary | |
133 | * properties + the number of enumerated properties. | |
134 | */ | |
135 | struct ValueMap { | |
136 | /* The members between the begin/end markers are part of the pnames data layout; do not reorder or resize them. */ ||
374ca955 A |
137 | /* -- begin pnames data -- */ |
138 | /* Enum=>name EnumToOffset / NonContiguousEnumToOffset objects. */ | |
139 | /* Exactly one of these will be nonzero. */ | |
b75a7d8f A |
140 | Offset enumToName_offset; /* EnumToOffset; nonzero when the value enums are contiguous */ |
141 | Offset ncEnumToName_offset; /* NonContiguousEnumToOffset; nonzero when the value enums are disjoint */ | |
142 | ||
374ca955 A |
143 | Offset nameToEnum_offset; /* Name=>enum data */ |
144 | /* -- end pnames data -- */ | |
b75a7d8f A |
145 | }; |
146 | ||
374ca955 A |
147 | /* ---------------------------------------------------------------------- */ |
148 | /* PropertyAliases class */ | |
b75a7d8f A |
149 | |
150 | /** |
151 | * A class encapsulating access to the memory-mapped data representing | |
152 | * property aliases and property value aliases (pnames). The class | |
153 | * MUST have no v-table and declares certain methods inline -- small | |
154 | * methods and methods that are called from only one point. | |
155 | * | |
156 | * The data members in this class correspond to the in-memory layout | |
157 | * of the header of the pnames data, so an Offset is resolved relative to the address of this object (see getPointer). | |
158 | */ | |
159 | class PropertyAliases { | |
160 | /* The members between the begin/end markers mirror the pnames header; do not reorder or resize them. */ ||
374ca955 A |
161 | /* -- begin pnames data -- */ |
162 | /* Enum=>name EnumToOffset object for binary and enumerated */ | |
163 | /* properties. NOTE(review): the layout overview at the end of this header labels this target (>0) a NonContiguousEnumToOffset -- confirm which is current. */ | |
b75a7d8f A |
164 | Offset enumToName_offset; |
165 | ||
374ca955 | 166 | /* Name=>enum data for binary & enumerated properties */ |
b75a7d8f A |
167 | Offset nameToEnum_offset; |
168 | ||
374ca955 A |
169 | /* Enum=>offset EnumToOffset object mapping enumerated properties */ |
170 | /* to ValueMap objects. NOTE(review): the layout overview at the end of this header labels this target (>3) a NonContiguousEnumToOffset -- confirm which is current. */ | |
b75a7d8f A |
171 | Offset enumToValue_offset; |
172 | ||
374ca955 A |
173 | /* The following are needed by external readers of this data. */ |
174 | /* We don't use them ourselves. */ | |
175 | int16_t total_size; /* size in bytes excluding the udata header */ | |
176 | Offset valueMap_offset; /* offset to start of array */ | |
177 | int16_t valueMap_count; /* number of entries */ | |
178 | Offset nameGroupPool_offset; /* offset to start of array */ | |
179 | int16_t nameGroupPool_count; /* number of entries (not groups) */ | |
180 | Offset stringPool_offset; /* offset to start of pool */ | |
181 | int16_t stringPool_count; /* number of strings (not size in bytes) */ | |
b75a7d8f | 182 | |
374ca955 | 183 | /* -- end pnames data -- */ |
b75a7d8f A |
184 | |
185 | friend class ::Builder; | |
186 | /* Private lookup helpers; only declared here, their bodies are not in this header. */ ||
187 | const ValueMap* getValueMap(EnumValue prop) const; | |
188 | ||
189 | const char* chooseNameInGroup(Offset offset, | |
190 | UPropertyNameChoice choice) const; | |
191 | ||
192 | public: | |
193 | /* Converts an Offset into an address: the address of this object plus o bytes. A zero Offset is not special-cased here. */ ||
194 | inline const int8_t* getPointer(Offset o) const { | |
195 | return ((const int8_t*) this) + o; | |
196 | } | |
197 | /* Same as getPointer(), except that a zero Offset (absent entry) yields NULL. */ ||
198 | inline const int8_t* getPointerNull(Offset o) const { | |
199 | return o ? getPointer(o) : NULL; | |
200 | } | |
201 | /* Name <=> enum lookups for properties and property values; declared inline, bodies not in this header. */ ||
202 | inline const char* getPropertyName(EnumValue prop, | |
203 | UPropertyNameChoice choice) const; | |
204 | ||
205 | inline EnumValue getPropertyEnum(const char* alias) const; | |
206 | ||
207 | inline const char* getPropertyValueName(EnumValue prop, EnumValue value, | |
208 | UPropertyNameChoice choice) const; | |
209 | ||
210 | inline EnumValue getPropertyValueEnum(EnumValue prop, | |
211 | const char* alias) const; | |
374ca955 A |
212 | |
213 | static int32_t /* NOTE(review): presumably the class-level worker behind upname_swap() -- confirm */ | |
214 | swap(const UDataSwapper *ds, | |
215 | const uint8_t *inBytes, int32_t length, uint8_t *outBytes, | |
216 | UErrorCode *pErrorCode); | |
b75a7d8f A |
217 | }; |
218 | ||
374ca955 A |
219 | /* ---------------------------------------------------------------------- */ |
220 | /* EnumToOffset */ | |
b75a7d8f A |
221 | |
222 | /** |
223 | * A generic map from enum values to Offsets. The enum values must be | |
224 | * contiguous, from enumStart (inclusive) to enumLimit (exclusive). The Offset values may | |
225 | * point to anything. | |
226 | */ | |
227 | class EnumToOffset { | |
228 | /* _offsetArray is declared as a single Offset but stands for the first element of an array of enumLimit-enumStart Offsets. */ ||
374ca955 | 229 | /* -- begin pnames data -- */ |
b75a7d8f A |
230 | EnumValue enumStart; |
231 | EnumValue enumLimit; | |
374ca955 A |
232 | Offset _offsetArray; /* [array of enumLimit-enumStart] */ |
233 | /* -- end pnames data -- */ | |
b75a7d8f A |
234 | |
235 | friend class ::Builder; | |
236 | /* Address of the first Offset of the array (mutable and const forms). */ ||
237 | Offset* getOffsetArray() { | |
238 | return &_offsetArray; | |
239 | } | |
240 | ||
241 | const Offset* getOffsetArray() const { | |
242 | return &_offsetArray; | |
243 | } | |
244 | /* Size in bytes of an object holding n Offsets; sizeof(EnumToOffset) already accounts for one, hence (n - 1). */ ||
245 | static int32_t getSize(int32_t n) { | |
246 | return sizeof(EnumToOffset) + sizeof(Offset) * (n - 1); | |
247 | } | |
248 | /* Size in bytes of this object, computed from its own enum range. */ ||
374ca955 A |
249 | int32_t getSize() { |
250 | return getSize(enumLimit - enumStart); | |
251 | } | |
252 | ||
b75a7d8f A |
253 | public: |
254 | /* Returns the Offset stored for enumProbe, or 0 when enumProbe lies outside [enumStart, enumLimit). */ ||
255 | Offset getOffset(EnumValue enumProbe) const { | |
256 | if (enumProbe < enumStart || | |
257 | enumProbe >= enumLimit) { | |
374ca955 | 258 | return 0; /* not found */ |
b75a7d8f A |
259 | } |
260 | const Offset* p = getOffsetArray(); | |
261 | return p[enumProbe - enumStart]; /* array index is relative to enumStart */ | |
262 | } | |
374ca955 A |
263 | |
264 | static int32_t | |
265 | swap(const UDataSwapper *ds, | |
266 | const uint8_t *inBytes, int32_t length, uint8_t *outBytes, | |
267 | uint8_t *temp, int32_t pos, | |
268 | UErrorCode *pErrorCode); | |
b75a7d8f A |
269 | }; |
270 | ||
374ca955 A |
271 | /* ---------------------------------------------------------------------- */ |
272 | /* NonContiguousEnumToOffset */ | |
b75a7d8f A |
273 | |
274 | /** |
275 | * A generic map from enum values to Offsets. The enum values may be | |
276 | * disjoint. If they are contiguous, an EnumToOffset should be used | |
277 | * instead. The Offset values may point to anything. | |
278 | */ | |
279 | class NonContiguousEnumToOffset { | |
280 | /* Layout: count, then count EnumValues, then count Offsets. Only the first EnumValue is a declared member. */ ||
374ca955 | 281 | /* -- begin pnames data -- */ |
b75a7d8f | 282 | int32_t count; |
374ca955 A |
283 | EnumValue _enumArray; /* [array of count] */ |
284 | /* Offset _offsetArray; // [array of count] after enumValue[count-1] */ | |
285 | /* -- end pnames data -- */ | |
b75a7d8f A |
286 | |
287 | friend class ::Builder; | |
288 | /* Address of the first EnumValue of the enum array (mutable and const forms). */ ||
289 | EnumValue* getEnumArray() { | |
290 | return &_enumArray; | |
291 | } | |
292 | ||
293 | const EnumValue* getEnumArray() const { | |
294 | return &_enumArray; | |
295 | } | |
296 | /* The Offset array is not a declared member; it starts right after the count EnumValues. */ ||
297 | Offset* getOffsetArray() { | |
298 | return (Offset*) (getEnumArray() + count); | |
299 | } | |
300 | /* NOTE(review): the const overload casts with (Offset*) instead of (const Offset*); the result is converted back to const Offset* on return. */ ||
301 | const Offset* getOffsetArray() const { | |
302 | return (Offset*) (getEnumArray() + count); | |
303 | } | |
304 | /* Bytes needed for n entries: the count field plus n EnumValues and n Offsets. */ ||
305 | static int32_t getSize(int32_t n) { | |
306 | return sizeof(int32_t) + (sizeof(EnumValue) + sizeof(Offset)) * n; | |
307 | } | |
308 | /* Size in bytes of this object, computed from its own count. */ ||
374ca955 A |
309 | int32_t getSize() { |
310 | return getSize(count); | |
311 | } | |
312 | ||
b75a7d8f A |
313 | public: |
314 | /* Returns the Offset paired with enumProbe, or 0 if enumProbe is not in the enum array. The early break relies on the enum array being in ascending order. */ ||
315 | Offset getOffset(EnumValue enumProbe) const { | |
316 | const EnumValue* e = getEnumArray(); | |
317 | const Offset* p = getOffsetArray(); | |
374ca955 A |
318 | /* linear search; binary later if warranted */ |
319 | /* (binary is not faster for short lists) */ | |
b75a7d8f A |
320 | for (int32_t i=0; i<count; ++i) { |
321 | if (e[i] < enumProbe) continue; | |
322 | if (e[i] > enumProbe) break; | |
323 | return p[i]; | |
324 | } | |
374ca955 | 325 | return 0; /* not found */ |
b75a7d8f | 326 | } |
374ca955 A |
327 | |
328 | static int32_t | |
329 | swap(const UDataSwapper *ds, | |
330 | const uint8_t *inBytes, int32_t length, uint8_t *outBytes, | |
331 | uint8_t *temp, int32_t pos, | |
332 | UErrorCode *pErrorCode); | |
b75a7d8f A |
333 | }; |
334 | ||
374ca955 A |
335 | /* ---------------------------------------------------------------------- */ |
336 | /* NameToEnum */ | |
b75a7d8f A |
337 | |
338 | /** |
339 | * A map from names to enum values. Names are stored as Offsets, resolved through a PropertyAliases object, and matched with uprv_comparePropertyNames. | |
340 | */ | |
341 | class NameToEnum { | |
342 | /* Layout: count, then count EnumValues, then count name Offsets. Only the first EnumValue is a declared member. */ ||
374ca955 A |
343 | /* -- begin pnames data -- */ |
344 | int32_t count; /* number of entries */ | |
345 | EnumValue _enumArray; /* [array of count] EnumValues */ | |
346 | /* Offset _nameArray; // [array of count] offsets to names */ | |
347 | /* -- end pnames data -- */ | |
b75a7d8f A |
348 | |
349 | friend class ::Builder; | |
350 | /* Address of the first EnumValue of the enum array (mutable and const forms). */ ||
351 | EnumValue* getEnumArray() { | |
352 | return &_enumArray; | |
353 | } | |
354 | ||
355 | const EnumValue* getEnumArray() const { | |
356 | return &_enumArray; | |
357 | } | |
358 | /* The name Offset array is not a declared member; it starts right after the count EnumValues. */ ||
359 | Offset* getNameArray() { | |
360 | return (Offset*) (getEnumArray() + count); | |
361 | } | |
362 | /* NOTE(review): the const overload casts with (Offset*) instead of (const Offset*); the result is converted back to const Offset* on return. */ ||
363 | const Offset* getNameArray() const { | |
364 | return (Offset*) (getEnumArray() + count); | |
365 | } | |
366 | /* Bytes needed for n entries: the count field plus n EnumValues and n Offsets. */ ||
367 | static int32_t getSize(int32_t n) { | |
368 | return sizeof(int32_t) + (sizeof(Offset) + sizeof(EnumValue)) * n; | |
369 | } | |
370 | /* Size in bytes of this object, computed from its own count. */ ||
374ca955 A |
371 | int32_t getSize() { |
372 | return getSize(count); | |
373 | } | |
374 | ||
b75a7d8f A |
375 | public: |
376 | /* Returns the EnumValue at the index whose name compares equal to alias, or UCHAR_INVALID_CODE if there is none. */ ||
377 | EnumValue getEnum(const char* alias, const PropertyAliases& data) const { | |
378 | /* data provides getPointer(), which turns each name Offset into a char* string. */ ||
379 | const Offset* n = getNameArray(); | |
380 | const EnumValue* e = getEnumArray(); | |
381 | /* The early break below relies on the names being stored in ascending uprv_comparePropertyNames order. */ ||
374ca955 A |
382 | /* linear search; binary later if warranted */ |
383 | /* (binary is not faster for short lists) */ | |
b75a7d8f A |
384 | for (int32_t i=0; i<count; ++i) { |
385 | const char* name = (const char*) data.getPointer(n[i]); | |
386 | int32_t c = uprv_comparePropertyNames(alias, name); | |
387 | if (c > 0) continue; | |
388 | if (c < 0) break; | |
389 | return e[i]; | |
390 | } | |
391 | ||
392 | return UCHAR_INVALID_CODE; | |
393 | } | |
374ca955 A |
394 | |
395 | static int32_t | |
396 | swap(const UDataSwapper *ds, | |
397 | const uint8_t *inBytes, int32_t length, uint8_t *outBytes, | |
398 | uint8_t *temp, int32_t pos, | |
399 | UErrorCode *pErrorCode); | |
b75a7d8f A |
400 | }; |
401 | ||
402 | /*---------------------------------------------------------------------- | |
403 | * | |
404 | * In-memory layout. THIS IS NOT A STANDALONE DOCUMENT. It goes | |
405 | * together with above C++ declarations and gives an overview. | |
406 | * | |
407 | * See above for definitions of Offset and EnumValue. Also, refer to | |
408 | * above class declarations for the "bottom line" on data layout. | |
409 | * | |
410 | * Sizes: | |
411 | * '*_offset' is an Offset (see above) | |
412 | * 'count' members are typically int32_t (see above declarations) | |
413 | * 'enumArray' is an array of EnumValue (see above) | |
414 | * 'offsetArray' is an array of Offset (see above) | |
415 | * 'nameArray' is an array of Offset (see above) | |
416 | * 'enum*' is an EnumValue (see above) | |
417 | * '*Array [x n]' means that *Array has n elements | |
418 | * | |
419 | * References: | |
420 | * Instead of pointers, this flat data structure contains offsets. | |
421 | * All offsets are relative to the start of 'header'. A notation | |
422 | * is used to indicate what structure each offset points to: | |
423 | * 'foo (>x)' the offset(s) in foo point to structure x | |
424 | * | |
425 | * Structures: | |
426 | * Each structure is assigned a number, except for the header, | |
427 | * which is called 'header'. The numbers are not contiguous | |
428 | * for historical reasons. Some structures have sub-parts | |
429 | * that are denoted with a letter, e.g., "5a". | |
430 | * | |
431 | * BEGIN LAYOUT | |
432 | * ============ | |
433 | * header: | |
434 | * enumToName_offset (>0) | |
435 | * nameToEnum_offset (>2) | |
436 | * enumToValue_offset (>3) | |
437 | * (alignment padding built into the header) | |
374ca955 A |
438 | * |
439 | * The header also contains the following, used by "external readers" | |
440 | * like ICU4J and icuswap. | |
441 | * | |
442 | * // The following are needed by external readers of this data. | |
443 | * // We don't use them ourselves. | |
444 | * int16_t total_size; // size in bytes excluding the udata header | |
445 | * Offset valueMap_offset; // offset to start of array | |
446 | * int16_t valueMap_count; // number of entries | |
447 | * Offset nameGroupPool_offset; // offset to start of array | |
448 | * int16_t nameGroupPool_count; // number of entries (not groups) | |
449 | * Offset stringPool_offset; // offset to start of pool | |
450 | * int16_t stringPool_count; // number of strings (not size in bytes) | |
451 | * | |
b75a7d8f A |
452 | * 0: # NonContiguousEnumToOffset obj for props => name groups |
453 | * count | |
454 | * enumArray [x count] | |
455 | * offsetArray [x count] (>98) | |
456 | * | |
457 | * => pad to next 4-byte boundary | |
458 | * | |
459 | * (1: omitted -- no longer used) | |
460 | * | |
461 | * 2: # NameToEnum obj for binary & enumerated props | |
462 | * count | |
463 | * enumArray [x count] | |
464 | * nameArray [x count] (>99) | |
465 | * | |
466 | * => pad to next 4-byte boundary | |
467 | * | |
468 | * 3: # NonContiguousEnumToOffset obj for enumerated props => ValueMaps | |
469 | * count | |
470 | * enumArray [x count] | |
471 | * offsetArray [x count] (>4) | |
472 | * | |
473 | * => pad to next 4-byte boundary | |
474 | * | |
475 | * 4: # ValueMap array [x one for each enumerated prop i] | |
476 | * enumToName_offset (>5a +2*i) one of these two is NULL, one is not | |
477 | * ncEnumToName_offset (>5b +2*i) | |
478 | * nameToEnum_offset (>6 +2*i) | |
479 | * | |
480 | * => pad to next 4-byte boundary | |
481 | * | |
482 | * for each enumerated prop (either 5a or 5b): | |
483 | * | |
484 | * 5a: # EnumToOffset for enumerated prop's values => name groups | |
485 | * enumStart | |
486 | * enumLimit | |
487 | * offsetArray [x enumLimit - enumStart] (>98) | |
488 | * | |
489 | * => pad to next 4-byte boundary | |
490 | * | |
491 | * 5b: # NonContiguousEnumToOffset for enumerated prop's values => name groups | |
492 | * count | |
493 | * enumArray [x count] | |
494 | * offsetArray [x count] (>98) | |
495 | * | |
496 | * => pad to next 4-byte boundary | |
497 | * | |
498 | * 6: # NameToEnum for enumerated prop's values | |
499 | * count | |
500 | * enumArray [x count] | |
501 | * nameArray [x count] (>99) | |
502 | * | |
503 | * => pad to next 4-byte boundary | |
504 | * | |
505 | * 98: # name group pool {NGP} | |
506 | * [array of Offset values] (>99) | |
507 | * | |
508 | * 99: # string pool {SP} | |
509 | * [pool of nul-terminated char* strings] | |
510 | */ | |
511 | U_NAMESPACE_END | |
512 | ||
374ca955 | 513 | #endif /* C++ */ |
b75a7d8f | 514 | |
374ca955 | 515 | #endif |