]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/uinvchar.c
ICU-6.2.8.tar.gz
[apple/icu.git] / icuSources / common / uinvchar.c
1 /*
2 *******************************************************************************
3 *
4 * Copyright (C) 1999-2004, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 *******************************************************************************
8 * file name: uinvchar.c
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:2
12 *
13 * created on: 2004sep14
14 * created by: Markus W. Scherer
15 *
16 * Functions for handling invariant characters, moved here from putil.c
17 * for better modularization.
18 */
19
20 #include "unicode/utypes.h"
21 #include "unicode/ustring.h"
22 #include "udataswp.h"
23 #include "cstring.h"
24 #include "cmemory.h"
25 #include "uassert.h"
26 #include "uinvchar.h"
27
28 /* invariant-character handling --------------------------------------------- */
29
30 /*
31 * These maps for ASCII to/from EBCDIC map invariant characters (see utypes.h)
32 * appropriately for most EBCDIC codepages.
33 *
34 * They currently also map most other ASCII graphic characters,
35 * appropriately for codepages 37 and 1047.
36 * Exceptions: The characters for []^ have different codes in 37 & 1047.
37 * Both versions are mapped to ASCII.
38 *
39 * ASCII 37 1047
40 * [ 5B BA AD
41 * ] 5D BB BD
42 * ^ 5E B0 5F
43 *
44 * There are no mappings for variant characters from Unicode to EBCDIC.
45 *
46 * Currently, C0 control codes are also included in these maps.
47 * Exceptions: S/390 Open Edition swaps LF and NEL codes compared with other
48 * EBCDIC platforms; both codes (15 and 25) are mapped to ASCII LF (0A),
49 * but there is no mapping for ASCII LF back to EBCDIC.
50 *
51 * ASCII EBCDIC S/390-OE
52 * LF 0A 25 15
53 * NEL 85 15 25
54 *
55 * The maps below explicitly exclude the variant
56 * control and graphical characters that are in ASCII-based
57 * codepages at 0x80 and above.
58 * "No mapping" is expressed by mapping to a 00 byte.
59 *
60 * These tables do not establish a converter or a codepage.
61 */
62
/*
 * asciiFromEbcdic[e] is the ASCII code for EBCDIC byte e.
 * A 0x00 entry at a nonzero index means "no mapping".
 * Each row below holds 16 consecutive EBCDIC codes.
 */
63 static const uint8_t asciiFromEbcdic[256]={
64 0x00, 0x01, 0x02, 0x03, 0x00, 0x09, 0x00, 0x7f, 0x00, 0x00, 0x00, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* EBCDIC 00..0f */
65 0x10, 0x11, 0x12, 0x13, 0x00, 0x0a, 0x08, 0x00, 0x18, 0x19, 0x00, 0x00, 0x1c, 0x1d, 0x1e, 0x1f, /* EBCDIC 10..1f; 15 -> LF */
66 0x00, 0x00, 0x00, 0x00, 0x00, 0x0a, 0x17, 0x1b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x06, 0x07, /* EBCDIC 20..2f; 25 -> LF */
67 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x14, 0x15, 0x00, 0x1a, /* EBCDIC 30..3f */
68
69 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2e, 0x3c, 0x28, 0x2b, 0x7c, /* EBCDIC 40..4f */
70 0x26, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x21, 0x24, 0x2a, 0x29, 0x3b, 0x5e, /* EBCDIC 50..5f; 5f -> '^' */
71 0x2d, 0x2f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2c, 0x25, 0x5f, 0x3e, 0x3f, /* EBCDIC 60..6f */
72 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x60, 0x3a, 0x23, 0x40, 0x27, 0x3d, 0x22, /* EBCDIC 70..7f */
73
74 0x00, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* EBCDIC 80..8f; a..i */
75 0x00, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* EBCDIC 90..9f; j..r */
76 0x00, 0x7e, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x00, 0x00, 0x00, 0x5b, 0x00, 0x00, /* EBCDIC a0..af; s..z, ad -> '[' */
77 0x5e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x5b, 0x5d, 0x00, 0x5d, 0x00, 0x00, /* EBCDIC b0..bf; b0 -> '^', ba -> '[', bb and bd -> ']' */
78
79 0x7b, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* EBCDIC c0..cf; A..I */
80 0x7d, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50, 0x51, 0x52, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* EBCDIC d0..df; J..R */
81 0x5c, 0x00, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* EBCDIC e0..ef; S..Z */
82 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 /* EBCDIC f0..ff; 0..9 */
83 };
84
/*
 * ebcdicFromAscii[a] is the EBCDIC code for ASCII/Unicode code a.
 * A 0x00 entry at a nonzero index means "no mapping".
 * Each row below holds 16 consecutive ASCII codes; 80..ff are all unmapped.
 */
85 static const uint8_t ebcdicFromAscii[256]={
86 0x00, 0x01, 0x02, 0x03, 0x37, 0x2d, 0x2e, 0x2f, 0x16, 0x05, 0x00, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* ASCII 00..0f; 0a (LF) has no mapping */
87 0x10, 0x11, 0x12, 0x13, 0x3c, 0x3d, 0x32, 0x26, 0x18, 0x19, 0x3f, 0x27, 0x1c, 0x1d, 0x1e, 0x1f, /* ASCII 10..1f */
88 0x40, 0x00, 0x7f, 0x00, 0x00, 0x6c, 0x50, 0x7d, 0x4d, 0x5d, 0x5c, 0x4e, 0x6b, 0x60, 0x4b, 0x61, /* ASCII 20..2f; 21 23 24 unmapped */
89 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0x7a, 0x5e, 0x4c, 0x7e, 0x6e, 0x6f, /* ASCII 30..3f */
90
91 0x00, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, /* ASCII 40..4f; 40 unmapped */
92 0xd7, 0xd8, 0xd9, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0x00, 0x00, 0x00, 0x00, 0x6d, /* ASCII 50..5f; 5b..5e unmapped */
93 0x00, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, /* ASCII 60..6f; 60 unmapped */
94 0x97, 0x98, 0x99, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0x00, 0x00, 0x00, 0x00, 0x07, /* ASCII 70..7f; 7b..7e unmapped */
95
96 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80..8f */
97 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90..9f */
98 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* a0..af */
99 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* b0..bf */
100
101 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* c0..cf */
102 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* d0..df */
103 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* e0..ef */
104 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 /* f0..ff */
105 };
106
107 /*
108 * Bit sets indicating which characters of the ASCII repertoire
109 * (by ASCII/Unicode code) are "invariant".
110 * See utypes.h for more details.
111 *
112 * All characters of the ASCII repertoire are considered invariant except
113 * for 0a (LF, cleared in the first word of the table below) and the following:
114 * 21 '!' <exclamation mark>
115 * 23 '#' <number sign>
116 * 24 '$' <dollar sign>
117 *
118 * 40 '@' <commercial at>
119 *
120 * 5b '[' <left bracket>
121 * 5c '\' <backslash>
122 * 5d ']' <right bracket>
123 * 5e '^' <circumflex>
124 *
125 * 60 '`' <grave accent>
126 *
127 * 7b '{' <left brace>
128 * 7c '|' <vertical line>
129 * 7d '}' <right brace>
130 * 7e '~' <tilde>
131 */
/*
 * Layout: word i holds the flags for codes 32*i .. 32*i+31.
 * Bit (c & 0x1f) of word (c >> 5) is set when code c is invariant;
 * a cleared bit marks a variant character.
 */
132 static const uint32_t invariantChars[4]={
133 0xfffffbff, /* 00..1f but not 0a */
134 0xffffffe5, /* 20..3f but not 21 23 24 */
135 0x87fffffe, /* 40..5f but not 40 5b..5e */
136 0x87fffffe /* 60..7f but not 60 7b..7e */
137 };
138
/*
 * Invariant-character test for ASCII-family code values held in an unsigned
 * type, or in a signed type whose value is known to be non-negative.
 * Codes above 0x7f are never invariant; for the others, the bit for c is
 * looked up in invariantChars[] (word c>>5, bit c&0x1f).
 * Note: the argument is evaluated more than once.
 */
#define UCHAR_IS_INVARIANT(c) (((c)<=0x7f) && ((invariantChars[(c)>>5]>>((c)&0x1f))&1)!=0)
144
/*
 * Invariant-character test for signed types: negative values are rejected
 * first, non-negative ones are handed to UCHAR_IS_INVARIANT().
 */
#define SCHAR_IS_INVARIANT(c) (((c)>=0) && UCHAR_IS_INVARIANT(c))
147
148 U_CAPI void U_EXPORT2
149 u_charsToUChars(const char *cs, UChar *us, int32_t length) {
150 UChar u;
151 uint8_t c;
152 UBool onlyInvariantChars;
153
154 /*
155 * Allow the entire ASCII repertoire to be mapped _to_ Unicode.
156 * For EBCDIC systems, this works for characters with codes from
157 * codepages 37 and 1047 or compatible.
158 */
159 onlyInvariantChars=TRUE;
160 while(length>0) {
161 c=(uint8_t)(*cs++);
162 #if U_CHARSET_FAMILY==U_ASCII_FAMILY
163 u=(UChar)c;
164 #elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY
165 u=(UChar)asciiFromEbcdic[c];
166 #else
167 # error U_CHARSET_FAMILY is not valid
168 #endif
169 if(u==0 && c!=0) {
170 onlyInvariantChars=FALSE;
171 }
172 *us++=u;
173 --length;
174 }
175 U_ASSERT(onlyInvariantChars); /* only invariant chars? */
176 }
177
178 U_CAPI void U_EXPORT2
179 u_UCharsToChars(const UChar *us, char *cs, int32_t length) {
180 UChar u;
181 UBool onlyInvariantChars;
182
183 onlyInvariantChars=TRUE;
184 while(length>0) {
185 u=*us++;
186 if(!UCHAR_IS_INVARIANT(u)) {
187 onlyInvariantChars=FALSE;
188 u=0;
189 }
190 #if U_CHARSET_FAMILY==U_ASCII_FAMILY
191 *cs++=(char)u;
192 #elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY
193 *cs++=(char)ebcdicFromAscii[u];
194 #else
195 # error U_CHARSET_FAMILY is not valid
196 #endif
197 --length;
198 }
199 U_ASSERT(onlyInvariantChars); /* only invariant chars? */
200 }
201
202 U_CAPI UBool U_EXPORT2
203 uprv_isInvariantString(const char *s, int32_t length) {
204 uint8_t c;
205
206 for(;;) {
207 if(length<0) {
208 /* NUL-terminated */
209 c=(uint8_t)*s++;
210 if(c==0) {
211 break;
212 }
213 } else {
214 /* count length */
215 if(length==0) {
216 break;
217 }
218 --length;
219 c=(uint8_t)*s++;
220 if(c==0) {
221 continue; /* NUL is invariant */
222 }
223 }
224 /* c!=0 now, one branch below checks c==0 for variant characters */
225
226 /*
227 * no assertions here because these functions are legitimately called
228 * for strings with variant characters
229 */
230 #if U_CHARSET_FAMILY==U_ASCII_FAMILY
231 if(!UCHAR_IS_INVARIANT(c)) {
232 return FALSE; /* found a variant char */
233 }
234 #elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY
235 c=asciiFromEbcdic[c];
236 if(c==0 || !UCHAR_IS_INVARIANT(c)) {
237 return FALSE; /* found a variant char */
238 }
239 #else
240 # error U_CHARSET_FAMILY is not valid
241 #endif
242 }
243 return TRUE;
244 }
245
246 U_CAPI UBool U_EXPORT2
247 uprv_isInvariantUString(const UChar *s, int32_t length) {
248 UChar c;
249
250 for(;;) {
251 if(length<0) {
252 /* NUL-terminated */
253 c=*s++;
254 if(c==0) {
255 break;
256 }
257 } else {
258 /* count length */
259 if(length==0) {
260 break;
261 }
262 --length;
263 c=*s++;
264 }
265
266 /*
267 * no assertions here because these functions are legitimately called
268 * for strings with variant characters
269 */
270 if(!UCHAR_IS_INVARIANT(c)) {
271 return FALSE; /* found a variant char */
272 }
273 }
274 return TRUE;
275 }
276
277 /* UDataSwapFn implementations used in udataswp.c ------- */
278
279 /* convert ASCII to EBCDIC and verify that all characters are invariant */
280 U_CFUNC int32_t
281 uprv_ebcdicFromAscii(const UDataSwapper *ds,
282 const void *inData, int32_t length, void *outData,
283 UErrorCode *pErrorCode) {
284 const uint8_t *s;
285 uint8_t *t;
286 uint8_t c;
287
288 int32_t count;
289
290 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
291 return 0;
292 }
293 if(ds==NULL || inData==NULL || length<0 || (length>0 && outData==NULL)) {
294 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
295 return 0;
296 }
297
298 /* setup and swapping */
299 s=(const uint8_t *)inData;
300 t=(uint8_t *)outData;
301 count=length;
302 while(count>0) {
303 c=*s++;
304 if(!UCHAR_IS_INVARIANT(c)) {
305 udata_printError(ds, "uprv_ebcdicFromAscii() string[%d] contains a variant character in position %d\n",
306 length, length-count);
307 *pErrorCode=U_INVALID_CHAR_FOUND;
308 return 0;
309 }
310 *t++=ebcdicFromAscii[c];
311 --count;
312 }
313
314 return length;
315 }
316
317 /* this function only checks and copies ASCII strings without conversion */
318 U_CFUNC int32_t
319 uprv_copyAscii(const UDataSwapper *ds,
320 const void *inData, int32_t length, void *outData,
321 UErrorCode *pErrorCode) {
322 const uint8_t *s;
323 uint8_t c;
324
325 int32_t count;
326
327 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
328 return 0;
329 }
330 if(ds==NULL || inData==NULL || length<0 || (length>0 && outData==NULL)) {
331 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
332 return 0;
333 }
334
335 /* setup and checking */
336 s=(const uint8_t *)inData;
337 count=length;
338 while(count>0) {
339 c=*s++;
340 if(!UCHAR_IS_INVARIANT(c)) {
341 udata_printError(ds, "uprv_copyFromAscii() string[%d] contains a variant character in position %d\n",
342 length, length-count);
343 *pErrorCode=U_INVALID_CHAR_FOUND;
344 return 0;
345 }
346 --count;
347 }
348
349 if(length>0 && inData!=outData) {
350 uprv_memcpy(outData, inData, length);
351 }
352
353 return length;
354 }
355
356 /* convert EBCDIC to ASCII and verify that all characters are invariant */
357 U_CFUNC int32_t
358 uprv_asciiFromEbcdic(const UDataSwapper *ds,
359 const void *inData, int32_t length, void *outData,
360 UErrorCode *pErrorCode) {
361 const uint8_t *s;
362 uint8_t *t;
363 uint8_t c;
364
365 int32_t count;
366
367 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
368 return 0;
369 }
370 if(ds==NULL || inData==NULL || length<0 || (length>0 && outData==NULL)) {
371 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
372 return 0;
373 }
374
375 /* setup and swapping */
376 s=(const uint8_t *)inData;
377 t=(uint8_t *)outData;
378 count=length;
379 while(count>0) {
380 c=*s++;
381 if(c!=0 && ((c=asciiFromEbcdic[c])==0 || !UCHAR_IS_INVARIANT(c))) {
382 udata_printError(ds, "uprv_asciiFromEbcdic() string[%d] contains a variant character in position %d\n",
383 length, length-count);
384 *pErrorCode=U_INVALID_CHAR_FOUND;
385 return 0;
386 }
387 *t++=c;
388 --count;
389 }
390
391 return length;
392 }
393
394 /* this function only checks and copies EBCDIC strings without conversion */
395 U_CFUNC int32_t
396 uprv_copyEbcdic(const UDataSwapper *ds,
397 const void *inData, int32_t length, void *outData,
398 UErrorCode *pErrorCode) {
399 const uint8_t *s;
400 uint8_t c;
401
402 int32_t count;
403
404 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
405 return 0;
406 }
407 if(ds==NULL || inData==NULL || length<0 || (length>0 && outData==NULL)) {
408 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
409 return 0;
410 }
411
412 /* setup and checking */
413 s=(const uint8_t *)inData;
414 count=length;
415 while(count>0) {
416 c=*s++;
417 if(c!=0 && ((c=asciiFromEbcdic[c])==0 || !UCHAR_IS_INVARIANT(c))) {
418 udata_printError(ds, "uprv_copyEbcdic() string[%] contains a variant character in position %d\n",
419 length, length-count);
420 *pErrorCode=U_INVALID_CHAR_FOUND;
421 return 0;
422 }
423 --count;
424 }
425
426 if(length>0 && inData!=outData) {
427 uprv_memcpy(outData, inData, length);
428 }
429
430 return length;
431 }
432
433 /* compare invariant strings; variant characters compare less than others and unlike each other */
434 U_CFUNC int32_t
435 uprv_compareInvAscii(const UDataSwapper *ds,
436 const char *outString, int32_t outLength,
437 const UChar *localString, int32_t localLength) {
438 int32_t minLength;
439 UChar32 c1, c2;
440 uint8_t c;
441
442 if(outString==NULL || outLength<-1 || localString==NULL || localLength<-1) {
443 return 0;
444 }
445
446 if(outLength<0) {
447 outLength=(int32_t)uprv_strlen(outString);
448 }
449 if(localLength<0) {
450 localLength=u_strlen(localString);
451 }
452
453 minLength= outLength<localLength ? outLength : localLength;
454
455 while(minLength>0) {
456 c=(uint8_t)*outString++;
457 if(UCHAR_IS_INVARIANT(c)) {
458 c1=c;
459 } else {
460 c1=-1;
461 }
462
463 c2=*localString++;
464 if(!UCHAR_IS_INVARIANT(c2)) {
465 c1=-2;
466 }
467
468 if((c1-=c2)!=0) {
469 return c1;
470 }
471
472 --minLength;
473 }
474
475 /* strings start with same prefix, compare lengths */
476 return outLength-localLength;
477 }
478
479 U_CFUNC int32_t
480 uprv_compareInvEbcdic(const UDataSwapper *ds,
481 const char *outString, int32_t outLength,
482 const UChar *localString, int32_t localLength) {
483 int32_t minLength;
484 UChar32 c1, c2;
485 uint8_t c;
486
487 if(outString==NULL || outLength<-1 || localString==NULL || localLength<-1) {
488 return 0;
489 }
490
491 if(outLength<0) {
492 outLength=(int32_t)uprv_strlen(outString);
493 }
494 if(localLength<0) {
495 localLength=u_strlen(localString);
496 }
497
498 minLength= outLength<localLength ? outLength : localLength;
499
500 while(minLength>0) {
501 c=(uint8_t)*outString++;
502 if(c==0) {
503 c1=0;
504 } else if((c1=asciiFromEbcdic[c])!=0 && UCHAR_IS_INVARIANT(c1)) {
505 /* c1 is set */
506 } else {
507 c1=-1;
508 }
509
510 c2=*localString++;
511 if(!UCHAR_IS_INVARIANT(c2)) {
512 c1=-2;
513 }
514
515 if((c1-=c2)!=0) {
516 return c1;
517 }
518
519 --minLength;
520 }
521
522 /* strings start with same prefix, compare lengths */
523 return outLength-localLength;
524 }