]>
Commit | Line | Data |
---|---|---|
9385eb3d A |
1 | /*- |
2 | * Copyright (c) 1995 Alex Tatmanjants <alex@elvisti.kiev.ua> | |
3 | * at Electronni Visti IA, Kiev, Ukraine. | |
4 | * All rights reserved. | |
5 | * | |
6 | * Redistribution and use in source and binary forms, with or without | |
7 | * modification, are permitted provided that the following conditions | |
8 | * are met: | |
9 | * 1. Redistributions of source code must retain the above copyright | |
10 | * notice, this list of conditions and the following disclaimer. | |
11 | * 2. Redistributions in binary form must reproduce the above copyright | |
12 | * notice, this list of conditions and the following disclaimer in the | |
13 | * documentation and/or other materials provided with the distribution. | |
14 | * | |
15 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND | |
16 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
18 | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE | |
19 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
20 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | |
21 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |
22 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |
23 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | |
24 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | |
25 | * SUCH DAMAGE. | |
26 | */ | |
27 | ||
28 | #include <sys/cdefs.h> | |
1f2f436a | 29 | __FBSDID("$FreeBSD: src/lib/libc/locale/collate.c,v 1.35 2005/02/27 20:31:13 ru Exp $"); |
9385eb3d | 30 | |
ad3c9f2a A |
31 | #include "xlocale_private.h" |
32 | /* assumes the locale_t variable is named loc */ | |
33 | #define __collate_chain_equiv_table (loc->__lc_collate->__chain_equiv_table) | |
34 | #define __collate_chain_pri_table (loc->__lc_collate->__chain_pri_table) | |
35 | #define __collate_char_pri_table (loc->__lc_collate->__char_pri_table) | |
36 | #define __collate_info (&loc->__lc_collate->__info) | |
37 | #define __collate_large_char_pri_table (loc->__lc_collate->__large_char_pri_table) | |
38 | #define __collate_substitute_table (loc->__lc_collate->__substitute_table) | |
39 | ||
9385eb3d A |
40 | #include "namespace.h" |
41 | #include <arpa/inet.h> | |
42 | #include <stdio.h> | |
43 | #include <stdlib.h> | |
ad3c9f2a | 44 | #include <stddef.h> |
9385eb3d | 45 | #include <string.h> |
ad3c9f2a | 46 | #include <wchar.h> |
9385eb3d A |
47 | #include <errno.h> |
48 | #include <unistd.h> | |
49 | #include <sysexits.h> | |
ad3c9f2a | 50 | #include <ctype.h> |
9385eb3d A |
51 | #include "un-namespace.h" |
52 | ||
53 | #include "collate.h" | |
54 | #include "setlocale.h" | |
55 | #include "ldpart.h" | |
56 | ||
57 | #include "libc_private.h" | |
58 | ||
ad3c9f2a A |
59 | #if __DARWIN_BYTE_ORDER == __DARWIN_LITTLE_ENDIAN |
60 | static void wntohl(wchar_t *, int); | |
61 | #endif /* __DARWIN_BYTE_ORDER == __DARWIN_LITTLE_ENDIAN */ | |
9385eb3d A |
62 | void __collate_err(int ex, const char *f) __dead2; |
63 | ||
ad3c9f2a A |
64 | /* |
65 | * Normally, the __collate_* routines should all be __private_extern__, | |
66 | * but grep is using them (3715846). Until we can provide an alternative, | |
67 | * we leave them public, and provide a read-only __collate_load_error variable | |
68 | */ | |
69 | #undef __collate_load_error | |
70 | int __collate_load_error = 1; | |
71 | ||
72 | __private_extern__ int | |
73 | __collate_load_tables(const char *encoding, locale_t loc) | |
9385eb3d | 74 | { |
b061a43b | 75 | int fd; |
9385eb3d | 76 | FILE *fp; |
ad3c9f2a | 77 | int i, saverr, chains, z; |
9385eb3d | 78 | char strbuf[STR_LEN], buf[PATH_MAX]; |
ad3c9f2a A |
79 | struct __xlocale_st_collate *TMP; |
80 | static struct __xlocale_st_collate *cache = NULL; | |
81 | struct __collate_st_info info; | |
82 | void *vp; | |
9385eb3d A |
83 | |
84 | /* 'encoding' must be already checked. */ | |
85 | if (strcmp(encoding, "C") == 0 || strcmp(encoding, "POSIX") == 0) { | |
ad3c9f2a A |
86 | loc->__collate_load_error = 1; |
87 | if (loc == &__global_locale) | |
88 | __collate_load_error = 1; | |
89 | XL_RELEASE(loc->__lc_collate); | |
90 | loc->__lc_collate = NULL; | |
9385eb3d A |
91 | return (_LDP_CACHE); |
92 | } | |
93 | ||
94 | /* | |
95 | * If the locale name is the same as our cache, use the cache. | |
96 | */ | |
ad3c9f2a A |
97 | if (cache && strcmp(encoding, cache->__encoding) == 0) { |
98 | loc->__collate_load_error = 0; | |
99 | if (loc == &__global_locale) | |
100 | __collate_load_error = 0; | |
101 | XL_RELEASE(loc->__lc_collate); | |
102 | loc->__lc_collate = cache; | |
103 | XL_RETAIN(loc->__lc_collate); | |
9385eb3d A |
104 | return (_LDP_CACHE); |
105 | } | |
106 | ||
107 | /* | |
108 | * Slurp the locale file into the cache. | |
109 | */ | |
110 | ||
111 | /* 'PathLocale' must be already set & checked. */ | |
112 | /* Range checking not needed, encoding has fixed size */ | |
974e3884 | 113 | (void)strcpy(buf, encoding); |
9385eb3d | 114 | (void)strcat(buf, "/LC_COLLATE"); |
b061a43b A |
115 | fd = __open_path_locale(buf); |
116 | if (fd == -1) { | |
117 | return (_LDP_ERROR); | |
118 | } | |
119 | if ((fp = fdopen(fd, "r")) == NULL) { | |
120 | close(fd); | |
9385eb3d | 121 | return (_LDP_ERROR); |
974e3884 | 122 | } |
9385eb3d A |
123 | |
124 | if (fread(strbuf, sizeof(strbuf), 1, fp) != 1) { | |
125 | saverr = errno; | |
126 | (void)fclose(fp); | |
127 | errno = saverr; | |
128 | return (_LDP_ERROR); | |
129 | } | |
130 | chains = -1; | |
ad3c9f2a | 131 | if (strcmp(strbuf, COLLATE_VERSION1_1A) == 0) |
9385eb3d A |
132 | chains = 1; |
133 | if (chains < 0) { | |
134 | (void)fclose(fp); | |
135 | errno = EFTYPE; | |
136 | return (_LDP_ERROR); | |
137 | } | |
138 | if (chains) { | |
ad3c9f2a | 139 | if (fread(&info, sizeof(info), 1, fp) != 1) { |
9385eb3d A |
140 | saverr = errno; |
141 | (void)fclose(fp); | |
142 | errno = saverr; | |
143 | return (_LDP_ERROR); | |
144 | } | |
ad3c9f2a A |
145 | #if __DARWIN_BYTE_ORDER == __DARWIN_LITTLE_ENDIAN |
146 | for(z = 0; z < info.directive_count; z++) { | |
147 | info.undef_pri[z] = ntohl(info.undef_pri[z]); | |
148 | info.subst_count[z] = ntohl(info.subst_count[z]); | |
149 | } | |
150 | info.chain_count = ntohl(info.chain_count); | |
151 | info.large_pri_count = ntohl(info.large_pri_count); | |
152 | #endif /* __DARWIN_BYTE_ORDER == __DARWIN_LITTLE_ENDIAN */ | |
153 | if ((chains = info.chain_count) < 0) { | |
9385eb3d A |
154 | (void)fclose(fp); |
155 | errno = EFTYPE; | |
156 | return (_LDP_ERROR); | |
157 | } | |
158 | } else | |
159 | chains = TABLE_SIZE; | |
160 | ||
ad3c9f2a A |
161 | i = sizeof(struct __xlocale_st_collate) |
162 | + sizeof(struct __collate_st_chain_pri) * chains | |
163 | + sizeof(struct __collate_st_large_char_pri) * info.large_pri_count; | |
164 | for(z = 0; z < info.directive_count; z++) | |
165 | i += sizeof(struct __collate_st_subst) * info.subst_count[z]; | |
166 | if ((TMP = (struct __xlocale_st_collate *)malloc(i)) == NULL) { | |
9385eb3d A |
167 | saverr = errno; |
168 | (void)fclose(fp); | |
169 | errno = saverr; | |
170 | return (_LDP_ERROR); | |
171 | } | |
ad3c9f2a A |
172 | TMP->__refcount = 2; /* one for the locale, one for the cache */ |
173 | TMP->__free_extra = NULL; | |
9385eb3d A |
174 | |
175 | #define FREAD(a, b, c, d) \ | |
176 | { \ | |
177 | if (fread(a, b, c, d) != c) { \ | |
178 | saverr = errno; \ | |
ad3c9f2a | 179 | free(TMP); \ |
9385eb3d A |
180 | (void)fclose(d); \ |
181 | errno = saverr; \ | |
182 | return (_LDP_ERROR); \ | |
183 | } \ | |
184 | } | |
185 | ||
ad3c9f2a A |
186 | /* adjust size to read the remaining in one chunk */ |
187 | i -= offsetof(struct __xlocale_st_collate, __char_pri_table); | |
188 | FREAD(TMP->__char_pri_table, i, 1, fp); | |
9385eb3d A |
189 | (void)fclose(fp); |
190 | ||
ad3c9f2a A |
191 | vp = (void *)(TMP + 1); |
192 | ||
193 | /* the COLLATE_SUBST_DUP optimization relies on COLL_WEIGHTS_MAX == 2 */ | |
194 | if (info.subst_count[0] > 0) { | |
195 | TMP->__substitute_table[0] = (struct __collate_st_subst *)vp; | |
196 | vp += info.subst_count[0] * sizeof(struct __collate_st_subst); | |
197 | } else | |
198 | TMP->__substitute_table[0] = NULL; | |
199 | if (info.flags & COLLATE_SUBST_DUP) | |
200 | TMP->__substitute_table[1] = TMP->__substitute_table[0]; | |
201 | else if (info.subst_count[1] > 0) { | |
202 | TMP->__substitute_table[1] = (struct __collate_st_subst *)vp; | |
203 | vp += info.subst_count[1] * sizeof(struct __collate_st_subst); | |
204 | } else | |
205 | TMP->__substitute_table[1] = NULL; | |
206 | ||
207 | if (chains > 0) { | |
208 | TMP->__chain_pri_table = (struct __collate_st_chain_pri *)vp; | |
209 | vp += chains * sizeof(struct __collate_st_chain_pri); | |
210 | } else | |
211 | TMP->__chain_pri_table = NULL; | |
212 | if (info.large_pri_count > 0) | |
213 | TMP->__large_char_pri_table = (struct __collate_st_large_char_pri *)vp; | |
214 | else | |
215 | TMP->__large_char_pri_table = NULL; | |
216 | ||
217 | #if __DARWIN_BYTE_ORDER == __DARWIN_LITTLE_ENDIAN | |
218 | { | |
219 | struct __collate_st_char_pri *p = TMP->__char_pri_table; | |
220 | for(i = UCHAR_MAX + 1; i-- > 0; p++) { | |
221 | for(z = 0; z < info.directive_count; z++) | |
222 | p->pri[z] = ntohl(p->pri[z]); | |
223 | } | |
224 | } | |
225 | for(z = 0; z < info.directive_count; z++) | |
226 | if (info.subst_count[z] > 0) { | |
227 | struct __collate_st_subst *p = TMP->__substitute_table[z]; | |
228 | for(i = info.subst_count[z]; i-- > 0; p++) { | |
229 | p->val = ntohl(p->val); | |
230 | wntohl(p->str, STR_LEN); | |
231 | } | |
232 | } | |
233 | { | |
234 | struct __collate_st_chain_pri *p = TMP->__chain_pri_table; | |
235 | for(i = chains; i-- > 0; p++) { | |
236 | wntohl(p->str, STR_LEN); | |
237 | for(z = 0; z < info.directive_count; z++) | |
238 | p->pri[z] = ntohl(p->pri[z]); | |
239 | } | |
240 | } | |
241 | if (info.large_pri_count > 0) { | |
242 | struct __collate_st_large_char_pri *p = TMP->__large_char_pri_table; | |
243 | for(i = info.large_pri_count; i-- > 0; p++) { | |
244 | p->val = ntohl(p->val); | |
245 | for(z = 0; z < info.directive_count; z++) | |
246 | p->pri.pri[z] = ntohl(p->pri.pri[z]); | |
9385eb3d A |
247 | } |
248 | } | |
ad3c9f2a A |
249 | #endif /* __DARWIN_BYTE_ORDER == __DARWIN_LITTLE_ENDIAN */ |
250 | (void)strcpy(TMP->__encoding, encoding); | |
251 | (void)memcpy(&TMP->__info, &info, sizeof(info)); | |
252 | XL_RELEASE(cache); | |
253 | cache = TMP; | |
254 | XL_RELEASE(loc->__lc_collate); | |
255 | loc->__lc_collate = cache; | |
256 | /* no need to retain, since we set __refcount to 2 above */ | |
257 | ||
258 | loc->__collate_substitute_nontrivial = (info.subst_count[0] > 0 || info.subst_count[1] > 0); | |
259 | loc->__collate_load_error = 0; | |
260 | if (loc == &__global_locale) | |
261 | __collate_load_error = 0; | |
9385eb3d A |
262 | |
263 | return (_LDP_LOADED); | |
264 | } | |
265 | ||
ad3c9f2a A |
/*
 * Return the number of wide characters in s that precede the first NUL,
 * looking at no more than len elements.  The table strings this is used on
 * are fixed-size arrays that may be completely full, so the bound is tested
 * before each element is read.
 */
static int
__collate_wcsnlen(const wchar_t *s, int len)
{
	int n = 0;

	while (n < len && s[n] != 0)
		n++;
	return n;
}
276 | ||
277 | static struct __collate_st_subst * | |
278 | substsearch(const wchar_t key, struct __collate_st_subst *tab, int n) | |
279 | { | |
280 | int low = 0; | |
281 | int high = n - 1; | |
282 | int next, compar; | |
283 | struct __collate_st_subst *p; | |
284 | ||
285 | while (low <= high) { | |
286 | next = (low + high) / 2; | |
287 | p = tab + next; | |
288 | compar = key - p->val; | |
289 | if (compar == 0) | |
290 | return p; | |
291 | if (compar > 0) | |
292 | low = next + 1; | |
293 | else | |
294 | high = next - 1; | |
295 | } | |
296 | return NULL; | |
297 | } | |
298 | ||
299 | __private_extern__ wchar_t * | |
300 | __collate_substitute(const wchar_t *s, int which, locale_t loc) | |
9385eb3d A |
301 | { |
302 | int dest_len, len, nlen; | |
ad3c9f2a A |
303 | int n, delta, nsubst; |
304 | wchar_t *dest_str = NULL; | |
305 | const wchar_t *fp; | |
306 | struct __collate_st_subst *subst, *match; | |
9385eb3d A |
307 | |
308 | if (s == NULL || *s == '\0') | |
ad3c9f2a A |
309 | return (__collate_wcsdup(L"")); |
310 | dest_len = wcslen(s); | |
311 | nsubst = __collate_info->subst_count[which]; | |
312 | if (nsubst <= 0) | |
313 | return __collate_wcsdup(s); | |
314 | subst = __collate_substitute_table[which]; | |
315 | delta = dest_len / 4; | |
316 | if (delta < 2) | |
317 | delta = 2; | |
318 | dest_str = (wchar_t *)malloc((dest_len += delta) * sizeof(wchar_t)); | |
9385eb3d | 319 | if (dest_str == NULL) |
3d9156a7 | 320 | __collate_err(EX_OSERR, __func__); |
9385eb3d A |
321 | len = 0; |
322 | while (*s) { | |
ad3c9f2a A |
323 | if ((match = substsearch(*s, subst, nsubst)) != NULL) { |
324 | fp = match->str; | |
325 | n = __collate_wcsnlen(fp, STR_LEN); | |
326 | } else { | |
327 | fp = s; | |
328 | n = 1; | |
329 | } | |
330 | nlen = len + n; | |
9385eb3d | 331 | if (dest_len <= nlen) { |
ad3c9f2a | 332 | dest_str = reallocf(dest_str, (dest_len = nlen + delta) * sizeof(wchar_t)); |
9385eb3d | 333 | if (dest_str == NULL) |
3d9156a7 | 334 | __collate_err(EX_OSERR, __func__); |
9385eb3d | 335 | } |
ad3c9f2a A |
336 | wcsncpy(dest_str + len, fp, n); |
337 | len += n; | |
338 | s++; | |
9385eb3d | 339 | } |
ad3c9f2a | 340 | dest_str[len] = 0; |
9385eb3d A |
341 | return (dest_str); |
342 | } | |
343 | ||
ad3c9f2a A |
344 | static struct __collate_st_chain_pri * |
345 | chainsearch(const wchar_t *key, int *len, locale_t loc) | |
346 | { | |
347 | int low = 0; | |
348 | int high = __collate_info->chain_count - 1; | |
349 | int next, compar, l; | |
350 | struct __collate_st_chain_pri *p; | |
351 | struct __collate_st_chain_pri *tab = __collate_chain_pri_table; | |
352 | ||
353 | while (low <= high) { | |
354 | next = (low + high) / 2; | |
355 | p = tab + next; | |
356 | compar = *key - *p->str; | |
357 | if (compar == 0) { | |
358 | l = __collate_wcsnlen(p->str, STR_LEN); | |
359 | compar = wcsncmp(key, p->str, l); | |
360 | if (compar == 0) { | |
361 | *len = l; | |
362 | return p; | |
363 | } | |
364 | } | |
365 | if (compar > 0) | |
366 | low = next + 1; | |
367 | else | |
368 | high = next - 1; | |
369 | } | |
370 | return NULL; | |
371 | } | |
372 | ||
373 | static struct __collate_st_large_char_pri * | |
374 | largesearch(const wchar_t key, locale_t loc) | |
375 | { | |
376 | int low = 0; | |
377 | int high = __collate_info->large_pri_count - 1; | |
378 | int next, compar; | |
379 | struct __collate_st_large_char_pri *p; | |
380 | struct __collate_st_large_char_pri *tab = __collate_large_char_pri_table; | |
381 | ||
382 | while (low <= high) { | |
383 | next = (low + high) / 2; | |
384 | p = tab + next; | |
385 | compar = key - p->val; | |
386 | if (compar == 0) | |
387 | return p; | |
388 | if (compar > 0) | |
389 | low = next + 1; | |
390 | else | |
391 | high = next - 1; | |
392 | } | |
393 | return NULL; | |
394 | } | |
395 | ||
a9aaacca A |
396 | /* |
397 | * This is provided for programs (like grep) that are calling this | |
398 | * private function. This is also used by wcscoll() | |
399 | */ | |
400 | void | |
ad3c9f2a | 401 | __collate_lookup_l(const wchar_t *t, int *len, int *prim, int *sec, locale_t loc) |
9385eb3d A |
402 | { |
403 | struct __collate_st_chain_pri *p2; | |
ad3c9f2a | 404 | int l; |
9385eb3d | 405 | |
a9aaacca A |
406 | if (!*t) { |
407 | *len = 0; | |
408 | *prim = 0; | |
409 | *sec = 0; | |
410 | return; | |
411 | } | |
412 | ||
413 | NORMALIZE_LOCALE(loc); | |
414 | if (loc->__collate_load_error) { | |
415 | *len = 1; | |
416 | *prim = *t; | |
417 | *sec = 0; | |
418 | return; | |
419 | } | |
420 | ||
9385eb3d A |
421 | *len = 1; |
422 | *prim = *sec = 0; | |
ad3c9f2a A |
423 | p2 = chainsearch(t, &l, loc); |
424 | /* use the chain if prim >= 0 */ | |
425 | if (p2 && p2->pri[0] >= 0) { | |
426 | *len = l; | |
427 | *prim = p2->pri[0]; | |
428 | *sec = p2->pri[1]; | |
429 | return; | |
430 | } | |
431 | if (*t <= UCHAR_MAX) { | |
432 | *prim = __collate_char_pri_table[*t].pri[0]; | |
433 | *sec = __collate_char_pri_table[*t].pri[1]; | |
434 | return; | |
435 | } | |
436 | if (__collate_info->large_pri_count > 0) { | |
437 | struct __collate_st_large_char_pri *match; | |
438 | match = largesearch(*t, loc); | |
439 | if (match) { | |
440 | *prim = match->pri.pri[0]; | |
441 | *sec = match->pri.pri[1]; | |
442 | return; | |
443 | } | |
444 | } | |
445 | *prim = (l = __collate_info->undef_pri[0]) >= 0 ? l : *t - l; | |
446 | *sec = (l = __collate_info->undef_pri[1]) >= 0 ? l : *t - l; | |
447 | } | |
448 | ||
449 | /* | |
a9aaacca A |
450 | * This is also provided for programs (like grep) that are calling this |
451 | * private function - that do not perform their own multi-byte handling. | |
452 | * This will go away eventually. | |
ad3c9f2a A |
453 | */ |
454 | void | |
455 | __collate_lookup(const unsigned char *t, int *len, int *prim, int *sec) | |
456 | { | |
457 | locale_t loc = __current_locale(); | |
a9aaacca | 458 | wchar_t *w = NULL; |
ad3c9f2a A |
459 | int sverrno; |
460 | ||
a9aaacca A |
461 | if (!*t) { |
462 | *len = 0; | |
463 | *prim = 0; | |
464 | *sec = 0; | |
465 | return; | |
466 | } | |
467 | ||
468 | if (loc->__collate_load_error || (w = __collate_mbstowcs((const char *)t, loc)) == NULL) { | |
469 | *len = 1; | |
470 | *prim = (int)*t; | |
471 | *sec = 0; | |
472 | ||
473 | sverrno = errno; | |
474 | free((void*)w); | |
475 | errno = sverrno; | |
476 | return; | |
477 | } | |
478 | ||
ad3c9f2a A |
479 | __collate_lookup_l(w, len, prim, sec, loc); |
480 | sverrno = errno; | |
481 | free(w); | |
482 | errno = sverrno; | |
483 | } | |
a9aaacca | 484 | |
ad3c9f2a A |
485 | __private_extern__ void |
486 | __collate_lookup_which(const wchar_t *t, int *len, int *pri, int which, locale_t loc) | |
487 | { | |
488 | struct __collate_st_chain_pri *p2; | |
489 | int p, l; | |
490 | ||
491 | *len = 1; | |
492 | *pri = 0; | |
493 | p2 = chainsearch(t, &l, loc); | |
494 | if (p2) { | |
495 | p = p2->pri[which]; | |
496 | /* use the chain if pri >= 0 */ | |
497 | if (p >= 0) { | |
498 | *len = l; | |
499 | *pri = p; | |
500 | return; | |
501 | } | |
502 | } | |
503 | if (*t <= UCHAR_MAX) { | |
504 | *pri = __collate_char_pri_table[*t].pri[which]; | |
505 | return; | |
506 | } | |
507 | if (__collate_info->large_pri_count > 0) { | |
508 | struct __collate_st_large_char_pri *match; | |
509 | match = largesearch(*t, loc); | |
510 | if (match) { | |
511 | *pri = match->pri.pri[which]; | |
9385eb3d A |
512 | return; |
513 | } | |
514 | } | |
ad3c9f2a | 515 | *pri = (l = __collate_info->undef_pri[which]) >= 0 ? l : *t - l; |
9385eb3d A |
516 | } |
517 | ||
ad3c9f2a A |
518 | __private_extern__ wchar_t * |
519 | __collate_mbstowcs(const char *s, locale_t loc) | |
9385eb3d | 520 | { |
ad3c9f2a A |
521 | static const mbstate_t initial; |
522 | mbstate_t st; | |
523 | size_t len; | |
524 | const char *ss; | |
525 | wchar_t *wcs; | |
9385eb3d | 526 | |
ad3c9f2a A |
527 | ss = s; |
528 | st = initial; | |
529 | if ((len = mbsrtowcs_l(NULL, &ss, 0, &st, loc)) == (size_t)-1) | |
530 | return NULL; | |
531 | if ((wcs = (wchar_t *)malloc((len + 1) * sizeof(wchar_t))) == NULL) | |
3d9156a7 | 532 | __collate_err(EX_OSERR, __func__); |
ad3c9f2a A |
533 | st = initial; |
534 | mbsrtowcs_l(wcs, &s, len, &st, loc); | |
535 | wcs[len] = 0; | |
536 | ||
537 | return (wcs); | |
9385eb3d A |
538 | } |
539 | ||
ad3c9f2a A |
540 | __private_extern__ wchar_t * |
541 | __collate_wcsdup(const wchar_t *s) | |
542 | { | |
543 | size_t len = wcslen(s) + 1; | |
544 | wchar_t *wcs; | |
545 | ||
546 | if ((wcs = (wchar_t *)malloc(len * sizeof(wchar_t))) == NULL) | |
547 | __collate_err(EX_OSERR, __func__); | |
548 | wcscpy(wcs, s); | |
549 | return (wcs); | |
550 | } | |
551 | ||
552 | __private_extern__ void | |
553 | __collate_xfrm(const wchar_t *src, wchar_t **xf, locale_t loc) | |
554 | { | |
555 | int pri, len; | |
556 | size_t slen; | |
557 | const wchar_t *t; | |
558 | wchar_t *tt = NULL, *tr = NULL; | |
559 | int direc, pass; | |
560 | wchar_t *xfp; | |
561 | struct __collate_st_info *info = __collate_info; | |
562 | int sverrno; | |
563 | ||
564 | for(pass = 0; pass < COLL_WEIGHTS_MAX; pass++) | |
565 | xf[pass] = NULL; | |
566 | for(pass = 0; pass < info->directive_count; pass++) { | |
567 | direc = info->directive[pass]; | |
568 | if (pass == 0 || !(info->flags & COLLATE_SUBST_DUP)) { | |
569 | sverrno = errno; | |
570 | free(tt); | |
571 | errno = sverrno; | |
572 | tt = __collate_substitute(src, pass, loc); | |
573 | } | |
574 | if (direc & DIRECTIVE_BACKWARD) { | |
575 | wchar_t *bp, *fp, c; | |
576 | sverrno = errno; | |
577 | free(tr); | |
578 | errno = sverrno; | |
579 | tr = __collate_wcsdup(tt ? tt : src); | |
580 | bp = tr; | |
581 | fp = tr + wcslen(tr) - 1; | |
582 | while(bp < fp) { | |
583 | c = *bp; | |
584 | *bp++ = *fp; | |
585 | *fp-- = c; | |
586 | } | |
587 | t = (const wchar_t *)tr; | |
588 | } else if (tt) | |
589 | t = (const wchar_t *)tt; | |
590 | else | |
591 | t = (const wchar_t *)src; | |
592 | sverrno = errno; | |
593 | if ((xf[pass] = (wchar_t *)malloc(sizeof(wchar_t) * (wcslen(t) + 1))) == NULL) { | |
594 | errno = sverrno; | |
595 | slen = 0; | |
596 | goto end; | |
597 | } | |
598 | errno = sverrno; | |
599 | xfp = xf[pass]; | |
600 | if (direc & DIRECTIVE_POSITION) { | |
601 | while(*t) { | |
602 | __collate_lookup_which(t, &len, &pri, pass, loc); | |
603 | t += len; | |
604 | if (pri <= 0) { | |
605 | if (pri < 0) { | |
606 | errno = EINVAL; | |
607 | slen = 0; | |
608 | goto end; | |
609 | } | |
610 | pri = COLLATE_MAX_PRIORITY; | |
611 | } | |
612 | *xfp++ = pri; | |
613 | } | |
614 | } else { | |
615 | while(*t) { | |
616 | __collate_lookup_which(t, &len, &pri, pass, loc); | |
617 | t += len; | |
618 | if (pri <= 0) { | |
619 | if (pri < 0) { | |
620 | errno = EINVAL; | |
621 | slen = 0; | |
622 | goto end; | |
623 | } | |
624 | continue; | |
625 | } | |
626 | *xfp++ = pri; | |
627 | } | |
628 | } | |
629 | *xfp = 0; | |
630 | } | |
631 | end: | |
632 | sverrno = errno; | |
633 | free(tt); | |
634 | free(tr); | |
635 | errno = sverrno; | |
636 | } | |
637 | ||
638 | __private_extern__ void | |
9385eb3d A |
639 | __collate_err(int ex, const char *f) |
640 | { | |
641 | const char *s; | |
642 | int serrno = errno; | |
643 | ||
644 | s = _getprogname(); | |
645 | _write(STDERR_FILENO, s, strlen(s)); | |
646 | _write(STDERR_FILENO, ": ", 2); | |
647 | s = f; | |
648 | _write(STDERR_FILENO, s, strlen(s)); | |
649 | _write(STDERR_FILENO, ": ", 2); | |
650 | s = strerror(serrno); | |
651 | _write(STDERR_FILENO, s, strlen(s)); | |
652 | _write(STDERR_FILENO, "\n", 1); | |
653 | exit(ex); | |
654 | } | |
655 | ||
ad3c9f2a A |
656 | /* |
657 | * __collate_collating_symbol takes the multibyte string specified by | |
658 | * src and slen, and using ps, converts that to a wide character. Then | |
659 | * it is checked to verify it is a collating symbol, and then copies | |
660 | * it to the wide character string specified by dst and dlen (the | |
661 | * results are not null terminated). The length of the wide characters | |
662 | * copied to dst is returned if successful. Zero is returned if no such | |
663 | * collating symbol exists. (size_t)-1 is returned if there are wide-character | |
664 | * conversion errors, if the length of the converted string is greater that | |
665 | * STR_LEN or if dlen is too small. It is up to the calling routine to | |
666 | * preserve the mbstate_t structure as needed. | |
667 | */ | |
668 | __private_extern__ size_t | |
669 | __collate_collating_symbol(wchar_t *dst, size_t dlen, const char *src, size_t slen, mbstate_t *ps, locale_t loc) | |
670 | { | |
671 | wchar_t wname[STR_LEN]; | |
672 | wchar_t w, *wp; | |
673 | size_t len, l; | |
674 | ||
675 | /* POSIX locale */ | |
676 | if (loc->__collate_load_error) { | |
677 | if (dlen < 1) | |
678 | return (size_t)-1; | |
679 | if (slen != 1 || !isascii(*src)) | |
680 | return 0; | |
681 | *dst = *src; | |
682 | return 1; | |
683 | } | |
684 | for(wp = wname, len = 0; slen > 0; len++) { | |
685 | l = mbrtowc_l(&w, src, slen, ps, loc); | |
686 | if (l == (size_t)-1 || l == (size_t)-2) | |
687 | return (size_t)-1; | |
688 | if (l == 0) | |
689 | break; | |
690 | if (len >= STR_LEN) | |
691 | return -1; | |
692 | *wp++ = w; | |
693 | src += l; | |
694 | slen = (long)slen - (long)l; | |
695 | } | |
696 | if (len == 0 || len > dlen) | |
697 | return (size_t)-1; | |
698 | if (len == 1) { | |
699 | if (*wname <= UCHAR_MAX) { | |
700 | if (__collate_char_pri_table[*wname].pri[0] >= 0) { | |
701 | if (dlen > 0) | |
702 | *dst = *wname; | |
703 | return 1; | |
704 | } | |
705 | return 0; | |
706 | } else if (__collate_info->large_pri_count > 0) { | |
707 | struct __collate_st_large_char_pri *match; | |
708 | match = largesearch(*wname, loc); | |
709 | if (match && match->pri.pri[0] >= 0) { | |
710 | if (dlen > 0) | |
711 | *dst = *wname; | |
712 | return 1; | |
713 | } | |
714 | } | |
715 | return 0; | |
716 | } | |
717 | *wp = 0; | |
718 | if (__collate_info->chain_count > 0) { | |
719 | struct __collate_st_chain_pri *match; | |
720 | int ll; | |
721 | match = chainsearch(wname, &ll, loc); | |
722 | if (match) { | |
723 | if (ll < dlen) | |
724 | dlen = ll; | |
725 | wcsncpy(dst, wname, dlen); | |
726 | return ll; | |
727 | } | |
728 | } | |
729 | return 0; | |
730 | } | |
731 | ||
732 | /* | |
733 | * __collate_equiv_class returns the equivalence class number for the symbol | |
734 | * specified by src and slen, using ps to convert from multi-byte to wide | |
735 | * character. Zero is returned if the symbol is not in an equivalence | |
736 | * class. -1 is returned if there are wide character conversion error, | |
737 | * if there are any greater-than-8-bit characters or if a multi-byte symbol | |
738 | * is greater or equal to STR_LEN in length. It is up to the calling | |
739 | * routine to preserve the mbstate_t structure as needed. | |
740 | */ | |
741 | __private_extern__ int | |
742 | __collate_equiv_class(const char *src, size_t slen, mbstate_t *ps, locale_t loc) | |
743 | { | |
744 | wchar_t wname[STR_LEN]; | |
745 | wchar_t w, *wp; | |
746 | size_t len, l; | |
747 | int e; | |
748 | ||
749 | /* POSIX locale */ | |
750 | if (loc->__collate_load_error) | |
751 | return 0; | |
752 | for(wp = wname, len = 0; slen > 0; len++) { | |
753 | l = mbrtowc_l(&w, src, slen, ps, loc); | |
754 | if (l == (size_t)-1 || l == (size_t)-2) | |
755 | return -1; | |
756 | if (l == 0) | |
757 | break; | |
758 | if (len >= STR_LEN) | |
759 | return -1; | |
760 | *wp++ = w; | |
761 | src += l; | |
762 | slen = (long)slen - (long)l; | |
763 | } | |
764 | if (len == 0) | |
765 | return -1; | |
766 | if (len == 1) { | |
767 | e = -1; | |
768 | if (*wname <= UCHAR_MAX) | |
769 | e = __collate_char_pri_table[*wname].pri[0]; | |
770 | else if (__collate_info->large_pri_count > 0) { | |
771 | struct __collate_st_large_char_pri *match; | |
772 | match = largesearch(*wname, loc); | |
773 | if (match) | |
774 | e = match->pri.pri[0]; | |
775 | } | |
776 | if (e == 0) | |
777 | return IGNORE_EQUIV_CLASS; | |
778 | return e > 0 ? e : 0; | |
779 | } | |
780 | *wp = 0; | |
781 | if (__collate_info->chain_count > 0) { | |
782 | struct __collate_st_chain_pri *match; | |
783 | int ll; | |
784 | match = chainsearch(wname, &ll, loc); | |
785 | if (match) { | |
786 | e = match->pri[0]; | |
787 | if (e == 0) | |
788 | return IGNORE_EQUIV_CLASS; | |
789 | return e < 0 ? -e : e; | |
790 | } | |
791 | } | |
792 | return 0; | |
793 | } | |
794 | ||
795 | /* | |
796 | * __collate_equiv_match tries to match any single or multi-character symbol | |
797 | * in equivalence class equiv_class in the multi-byte string specified by src | |
798 | * and slen. If start is non-zero, it is taken to be the first (pre-converted) | |
799 | * wide character. Subsequence wide characters, if needed, will use ps in | |
800 | * the conversion. On a successful match, the length of the matched string | |
801 | * is returned (including the start character). If dst is non-NULL, the | |
802 | * matched wide-character string is copied to dst, a wide character array of | |
803 | * length dlen (the results are not zero-terminated). If rlen is non-NULL, | |
804 | * the number of character in src actually used is returned. Zero is | |
805 | * returned by __collate_equiv_match if there is no match. (size_t)-1 is | |
806 | * returned on error: if there were conversion errors or if dlen is too small | |
807 | * to accept the results. On no match or error, ps is restored to its incoming | |
808 | * state. | |
809 | */ | |
810 | size_t | |
811 | __collate_equiv_match(int equiv_class, wchar_t *dst, size_t dlen, wchar_t start, const char *src, size_t slen, mbstate_t *ps, size_t *rlen, locale_t loc) | |
812 | { | |
813 | wchar_t w; | |
814 | size_t len, l, clen; | |
815 | int i; | |
816 | wchar_t buf[STR_LEN], *wp; | |
817 | mbstate_t save; | |
818 | const char *s = src; | |
819 | size_t sl = slen; | |
820 | struct __collate_st_chain_pri *ch = NULL; | |
821 | ||
822 | /* POSIX locale */ | |
823 | if (loc->__collate_load_error) | |
824 | return (size_t)-1; | |
825 | if (equiv_class == IGNORE_EQUIV_CLASS) | |
826 | equiv_class = 0; | |
827 | if (ps) | |
828 | save = *ps; | |
829 | wp = buf; | |
830 | len = clen = 0; | |
831 | if (start) { | |
832 | *wp++ = start; | |
833 | len = 1; | |
834 | } | |
835 | /* convert up to the max chain length */ | |
836 | while(sl > 0 && len < __collate_info->chain_max_len) { | |
837 | l = mbrtowc_l(&w, s, sl, ps, loc); | |
838 | if (l == (size_t)-1 || l == (size_t)-2 || l == 0) | |
839 | break; | |
840 | *wp++ = w; | |
841 | s += l; | |
842 | clen += l; | |
843 | sl -= l; | |
844 | len++; | |
845 | } | |
846 | *wp = 0; | |
847 | if (len > 1 && (ch = chainsearch(buf, &i, loc)) != NULL) { | |
848 | int e = ch->pri[0]; | |
849 | if (e < 0) | |
850 | e = -e; | |
851 | if (e == equiv_class) | |
852 | goto found; | |
853 | } | |
854 | /* try single character */ | |
855 | i = 1; | |
856 | if (*buf <= UCHAR_MAX) { | |
857 | if (equiv_class == __collate_char_pri_table[*buf].pri[0]) | |
858 | goto found; | |
859 | } else if (__collate_info->large_pri_count > 0) { | |
860 | struct __collate_st_large_char_pri *match; | |
861 | match = largesearch(*buf, loc); | |
862 | if (match && equiv_class == match->pri.pri[0]) | |
863 | goto found; | |
864 | } | |
865 | /* no match */ | |
866 | if (ps) | |
867 | *ps = save; | |
868 | return 0; | |
869 | found: | |
870 | /* if we converted more than we used, restore to initial and reconvert | |
871 | * up to what did match */ | |
872 | if (i < len) { | |
873 | len = i; | |
874 | if (ps) | |
875 | *ps = save; | |
876 | if (start) | |
877 | i--; | |
878 | clen = 0; | |
879 | while(i-- > 0) { | |
880 | l = mbrtowc_l(&w, src, slen, ps, loc); | |
881 | src += l; | |
882 | clen += l; | |
883 | slen -= l; | |
884 | } | |
885 | } | |
886 | if (dst) { | |
887 | if (dlen < len) { | |
888 | if (ps) | |
889 | *ps = save; | |
890 | return (size_t)-1; | |
891 | } | |
892 | for(wp = buf; len > 0; len--) | |
893 | *dst++ = *wp++; | |
894 | } | |
895 | if (rlen) | |
896 | *rlen = clen; | |
897 | return len; | |
898 | } | |
899 | ||
900 | /* | |
901 | * __collate_equiv_value returns the primary collation value for the given | |
902 | * collating symbol specified by str and len. Zero or negative is return | |
903 | * if the collating symbol was not found. (Use by the bracket code in TRE.) | |
904 | */ | |
905 | __private_extern__ int | |
906 | __collate_equiv_value(locale_t loc, const wchar_t *str, size_t len) | |
907 | { | |
908 | int e; | |
909 | ||
910 | if (len < 1 || len >= STR_LEN) | |
911 | return -1; | |
912 | ||
913 | /* POSIX locale */ | |
914 | if (loc->__collate_load_error) | |
915 | return (len == 1 && *str <= UCHAR_MAX) ? *str : -1; | |
916 | ||
917 | if (len == 1) { | |
918 | e = -1; | |
919 | if (*str <= UCHAR_MAX) | |
920 | e = __collate_char_pri_table[*str].pri[0]; | |
921 | else if (__collate_info->large_pri_count > 0) { | |
922 | struct __collate_st_large_char_pri *match; | |
923 | match = largesearch(*str, loc); | |
924 | if (match) | |
925 | e = match->pri.pri[0]; | |
926 | } | |
927 | if (e == 0) | |
928 | return IGNORE_EQUIV_CLASS; | |
929 | return e > 0 ? e : 0; | |
930 | } | |
931 | if (__collate_info->chain_count > 0) { | |
932 | wchar_t name[STR_LEN]; | |
933 | struct __collate_st_chain_pri *match; | |
934 | int ll; | |
935 | ||
936 | wcsncpy(name, str, len); | |
937 | name[len] = 0; | |
938 | match = chainsearch(name, &ll, loc); | |
939 | if (match) { | |
940 | e = match->pri[0]; | |
941 | if (e == 0) | |
942 | return IGNORE_EQUIV_CLASS; | |
943 | return e < 0 ? -e : e; | |
944 | } | |
945 | } | |
946 | return 0; | |
947 | } | |
948 | ||
949 | #if __DARWIN_BYTE_ORDER == __DARWIN_LITTLE_ENDIAN | |
/*
 * Convert, in place, each element of str with ntohl().  Conversion stops
 * after len elements or at the first zero element, whichever comes first.
 *
 * The length is tested before the element is read, so a buffer of exactly
 * len elements with no zero terminator is never read past its end.
 */
static void
wntohl(wchar_t *str, int len)
{
	for (; len > 0 && *str != 0; str++, len--)
		*str = ntohl(*str);
}
956 | #endif /* __DARWIN_BYTE_ORDER == __DARWIN_LITTLE_ENDIAN */ | |
957 | ||
9385eb3d | 958 | #ifdef COLLATE_DEBUG |
ad3c9f2a A |
/*
 * Format a single character code for debug output.  Printable ASCII
 * (32..126) is shown as the quoted character followed by a space; anything
 * else is shown as a \x{..} hex escape with at least two hex digits.
 *
 * Returns a pointer to a static buffer that is overwritten by the next call.
 */
static char *
show(int c)
{
	/*
	 * The longest output is "\x{ffffffff}" (12 characters plus the
	 * terminator), so 16 bytes is enough for any 32-bit value.
	 */
	static char buf[16];

	if (c >= 32 && c <= 126)
		snprintf(buf, sizeof(buf), "'%c' ", c);
	else
		snprintf(buf, sizeof(buf), "\\x{%02x}", (unsigned int)c);
	return buf;
}
970 | ||
/*
 * Format a wide-character string for debug output.  At most len elements
 * of t are examined, stopping early at a zero element.  Printable ASCII
 * (32..126) is copied through; every other element is written as a \x{..}
 * hex escape with at least two hex digits.
 *
 * The output is truncated to fit the static buffer: an escape that does not
 * fit completely is dropped rather than emitted in part.
 *
 * Returns a pointer to a static buffer that is overwritten by the next call.
 */
static char *
showwcs(const wchar_t *t, int len)
{
	static char buf[64];
	char *cp = buf;
	/* last position that may hold a character; one byte stays reserved */
	char *const end = buf + sizeof(buf) - 1;

	/* check len before reading *t so an unterminated array is not overrun */
	for (; len > 0 && *t != 0 && cp < end; len--, t++) {
		if (*t >= 32 && *t <= 126) {
			*cp++ = (char)*t;
		} else {
			int n = snprintf(cp, (size_t)(end - cp) + 1,
			    "\\x{%02x}", (unsigned int)*t);

			/* n > room means the escape was cut short: discard it */
			if (n < 0 || n > end - cp)
				break;
			cp += n;
		}
	}
	*cp = '\0';
	return buf;
}
988 | ||
9385eb3d A |
989 | void |
990 | __collate_print_tables() | |
991 | { | |
ad3c9f2a A |
992 | int i, z; |
993 | locale_t loc = __current_locale(); | |
9385eb3d | 994 | |
ad3c9f2a A |
995 | printf("Info: p=%d s=%d f=0x%02x m=%d dc=%d up=%d us=%d pc=%d sc=%d cc=%d lc=%d\n", |
996 | __collate_info->directive[0], __collate_info->directive[1], | |
997 | __collate_info->flags, __collate_info->chain_max_len, | |
998 | __collate_info->directive_count, | |
999 | __collate_info->undef_pri[0], __collate_info->undef_pri[1], | |
1000 | __collate_info->subst_count[0], __collate_info->subst_count[1], | |
1001 | __collate_info->chain_count, __collate_info->large_pri_count); | |
1002 | for(z = 0; z < __collate_info->directive_count; z++) { | |
1003 | if (__collate_info->subst_count[z] > 0) { | |
1004 | struct __collate_st_subst *p2 = __collate_substitute_table[z]; | |
1005 | if (z == 0 && (__collate_info->flags & COLLATE_SUBST_DUP)) | |
1006 | printf("Both substitute tables:\n"); | |
1007 | else | |
1008 | printf("Substitute table %d:\n", z); | |
1009 | for (i = __collate_info->subst_count[z]; i-- > 0; p2++) | |
1010 | printf("\t%s --> \"%s\"\n", | |
1011 | show(p2->val), | |
1012 | showwcs(p2->str, STR_LEN)); | |
1013 | } | |
1014 | } | |
1015 | if (__collate_info->chain_count > 0) { | |
1016 | printf("Chain priority table:\n"); | |
1017 | struct __collate_st_chain_pri *p2 = __collate_chain_pri_table; | |
1018 | for (i = __collate_info->chain_count; i-- > 0; p2++) { | |
1019 | printf("\t\"%s\" :", showwcs(p2->str, STR_LEN)); | |
1020 | for(z = 0; z < __collate_info->directive_count; z++) | |
1021 | printf(" %d", p2->pri[z]); | |
1022 | putchar('\n'); | |
1023 | } | |
1024 | } | |
9385eb3d | 1025 | printf("Char priority table:\n"); |
ad3c9f2a A |
1026 | { |
1027 | struct __collate_st_char_pri *p2 = __collate_char_pri_table; | |
1028 | for (i = 0; i < UCHAR_MAX + 1; i++, p2++) { | |
1029 | printf("\t%s :", show(i)); | |
1030 | for(z = 0; z < __collate_info->directive_count; z++) | |
1031 | printf(" %d", p2->pri[z]); | |
1032 | putchar('\n'); | |
1033 | } | |
1034 | } | |
1035 | if (__collate_info->large_pri_count > 0) { | |
1036 | struct __collate_st_large_char_pri *p2 = __collate_large_char_pri_table; | |
1037 | printf("Large priority table:\n"); | |
1038 | for (i = __collate_info->large_pri_count; i-- > 0; p2++) { | |
1039 | printf("\t%s :", show(p2->val)); | |
1040 | for(z = 0; z < __collate_info->directive_count; z++) | |
1041 | printf(" %d", p2->pri.pri[z]); | |
1042 | putchar('\n'); | |
1043 | } | |
1044 | } | |
9385eb3d A |
1045 | } |
1046 | #endif |