]>
git.saurik.com Git - apple/icu.git/blob - icuSources/common/uinvchar.c
2 *******************************************************************************
4 * Copyright (C) 1999-2004, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * file name: uinvchar.c
10 * tab size: 8 (not used)
13 * created on: 2004sep14
14 * created by: Markus W. Scherer
16 * Functions for handling invariant characters, moved here from putil.c
17 * for better modularization.
20 #include "unicode/utypes.h"
21 #include "unicode/ustring.h"
28 /* invariant-character handling --------------------------------------------- */
31 * These maps for ASCII to/from EBCDIC map invariant characters (see utypes.h)
32 * appropriately for most EBCDIC codepages.
34 * They currently also map most other ASCII graphic characters,
35 * appropriately for codepages 37 and 1047.
36 * Exceptions: The characters for []^ have different codes in 37 & 1047.
37 * Both versions are mapped to ASCII.
44 * There are no mappings for variant characters from Unicode to EBCDIC.
46 * Currently, C0 control codes are also included in these maps.
47 * Exceptions: S/390 Open Edition swaps LF and NEL codes compared with other
48 * EBCDIC platforms; both codes (15 and 25) are mapped to ASCII LF (0A),
49 * but there is no mapping for ASCII LF back to EBCDIC.
51 * ASCII EBCDIC S/390-OE
55 * The maps below explicitly exclude the variant
56 * control and graphical characters that are in ASCII-based
57 * codepages at 0x80 and above.
58 * "No mapping" is expressed by mapping to a 00 byte.
60 * These tables do not establish a converter or a codepage.
/* 256-entry lookup table indexed by EBCDIC byte value; each entry is the ASCII code, and a 0x00 entry at a non-zero index means "no mapping". */
63 static const uint8_t asciiFromEbcdic
[256]={
64 0x00, 0x01, 0x02, 0x03, 0x00, 0x09, 0x00, 0x7f, 0x00, 0x00, 0x00, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
65 0x10, 0x11, 0x12, 0x13, 0x00, 0x0a, 0x08, 0x00, 0x18, 0x19, 0x00, 0x00, 0x1c, 0x1d, 0x1e, 0x1f,
66 0x00, 0x00, 0x00, 0x00, 0x00, 0x0a, 0x17, 0x1b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x06, 0x07,
67 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x14, 0x15, 0x00, 0x1a,
69 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2e, 0x3c, 0x28, 0x2b, 0x7c,
70 0x26, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x21, 0x24, 0x2a, 0x29, 0x3b, 0x5e,
71 0x2d, 0x2f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2c, 0x25, 0x5f, 0x3e, 0x3f,
72 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x60, 0x3a, 0x23, 0x40, 0x27, 0x3d, 0x22,
74 0x00, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
75 0x00, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
76 0x00, 0x7e, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x00, 0x00, 0x00, 0x5b, 0x00, 0x00,
77 0x5e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x5b, 0x5d, 0x00, 0x5d, 0x00, 0x00,
79 0x7b, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
80 0x7d, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50, 0x51, 0x52, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
81 0x5c, 0x00, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
82 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
/*
 * 256-entry lookup table indexed by ASCII code; each entry is the EBCDIC byte.
 * A 0x00 entry at a non-zero index means "no mapping"; this includes ASCII LF (0x0a)
 * and every index from 0x80 upward (the last eight rows are all zero).
 */
85 static const uint8_t ebcdicFromAscii
[256]={
86 0x00, 0x01, 0x02, 0x03, 0x37, 0x2d, 0x2e, 0x2f, 0x16, 0x05, 0x00, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
87 0x10, 0x11, 0x12, 0x13, 0x3c, 0x3d, 0x32, 0x26, 0x18, 0x19, 0x3f, 0x27, 0x1c, 0x1d, 0x1e, 0x1f,
88 0x40, 0x00, 0x7f, 0x00, 0x00, 0x6c, 0x50, 0x7d, 0x4d, 0x5d, 0x5c, 0x4e, 0x6b, 0x60, 0x4b, 0x61,
89 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0x7a, 0x5e, 0x4c, 0x7e, 0x6e, 0x6f,
91 0x00, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6,
92 0xd7, 0xd8, 0xd9, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0x00, 0x00, 0x00, 0x00, 0x6d,
93 0x00, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96,
94 0x97, 0x98, 0x99, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0x00, 0x00, 0x00, 0x00, 0x07,
96 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
97 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
98 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
99 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
101 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
102 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
103 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
104 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
108 * Bit sets indicating which characters of the ASCII repertoire
109 * (by ASCII/Unicode code) are "invariant".
110 * See utypes.h for more details.
112 * The characters of the ASCII repertoire are considered invariant, except
114 * 21 '!' <exclamation mark>
115 * 23 '#' <number sign>
116 * 24 '$' <dollar sign>
118 * 40 '@' <commercial at>
120 * 5b '[' <left bracket>
122 * 5d ']' <right bracket>
123 * 5e '^' <circumflex>
125 * 60 '`' <grave accent>
127 * 7b '{' <left brace>
128 * 7c '|' <vertical line>
129 * 7d '}' <right brace>
/*
 * Four 32-bit words covering codes 00..7f, 32 codes per word.
 * Bit (c & 0x1f) of word (c >> 5) is set when code c is invariant;
 * a cleared bit marks a variant character.
 */
132 static const uint32_t invariantChars
[4]={
133 0xfffffbff, /* 00..1f but not 0a */
134 0xffffffe5, /* 20..3f but not 21 23 24 */
135 0x87fffffe, /* 40..5f but not 40 5b..5e */
136 0x87fffffe /* 60..7f but not 60 7b..7e */
140 * test unsigned types (or values known to be non-negative) for invariant characters,
141 * tests ASCII-family character values
/*
 * Non-zero when c is at most 0x7f and its bit is set in invariantChars
 * (word index c>>5, bit index c&0x1f).
 * The argument is evaluated more than once, so pass an expression without side effects.
 */
143 #define UCHAR_IS_INVARIANT(c) (((c)<=0x7f) && (invariantChars[(c)>>5]&((uint32_t)1<<((c)&0x1f)))!=0)
145 /* test signed types for invariant characters, adds test for positive values */
/*
 * Same test as UCHAR_IS_INVARIANT, preceded by a 0<=c check so that negative
 * values are rejected before they are used as a table index.
 * The argument is evaluated more than once.
 */
146 #define SCHAR_IS_INVARIANT(c) ((0<=(c)) && UCHAR_IS_INVARIANT(c))
/*
 * u_charsToUChars: takes a char source (cs), a UChar destination (us) and a length.
 *
 * Visible behavior: on EBCDIC-family builds each source byte is mapped through
 * asciiFromEbcdic; builds with any other U_CHARSET_FAMILY fail with #error.
 * onlyInvariantChars starts TRUE, is cleared to FALSE on a path whose condition is
 * not visible in this extract, and is checked with U_ASSERT before the function ends.
 * NOTE(review): the copy loop and the ASCII-family branch body are not visible here.
 */
148 U_CAPI
void U_EXPORT2
149 u_charsToUChars(const char *cs
, UChar
*us
, int32_t length
) {
152 UBool onlyInvariantChars
;
155 * Allow the entire ASCII repertoire to be mapped _to_ Unicode.
156 * For EBCDIC systems, this works for characters with codes from
157 * codepages 37 and 1047 or compatible.
159 onlyInvariantChars
=TRUE
;
162 #if U_CHARSET_FAMILY==U_ASCII_FAMILY
164 #elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY
165 u
=(UChar
)asciiFromEbcdic
[c
];
167 # error U_CHARSET_FAMILY is not valid
170 onlyInvariantChars
=FALSE
;
175 U_ASSERT(onlyInvariantChars
); /* only invariant chars? */
/*
 * u_UCharsToChars: takes a UChar source (us), a char destination (cs) and a length.
 *
 * Visible behavior: a unit u that fails UCHAR_IS_INVARIANT clears onlyInvariantChars.
 * On EBCDIC-family builds the output byte is ebcdicFromAscii[u]; builds with any other
 * U_CHARSET_FAMILY fail with #error. U_ASSERT checks onlyInvariantChars before the
 * function ends.
 * NOTE(review): the loop, the value stored for a variant unit, and the ASCII-family
 * branch body are not visible in this extract.
 */
178 U_CAPI
void U_EXPORT2
179 u_UCharsToChars(const UChar
*us
, char *cs
, int32_t length
) {
181 UBool onlyInvariantChars
;
183 onlyInvariantChars
=TRUE
;
186 if(!UCHAR_IS_INVARIANT(u
)) {
187 onlyInvariantChars
=FALSE
;
190 #if U_CHARSET_FAMILY==U_ASCII_FAMILY
192 #elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY
193 *cs
++=(char)ebcdicFromAscii
[u
];
195 # error U_CHARSET_FAMILY is not valid
199 U_ASSERT(onlyInvariantChars
); /* only invariant chars? */
/*
 * uprv_isInvariantString: reports whether a char string contains only invariant characters.
 *
 * Visible behavior: NUL is accepted as invariant. On ASCII-family builds a byte that
 * fails UCHAR_IS_INVARIANT returns FALSE. On EBCDIC-family builds the byte is first
 * mapped through asciiFromEbcdic, and FALSE is returned when the mapping is 0 or the
 * mapped code is not invariant.
 * NOTE(review): the loop, the handling of `length`, and the final return are not
 * visible in this extract.
 */
202 U_CAPI UBool U_EXPORT2
203 uprv_isInvariantString(const char *s
, int32_t length
) {
221 continue; /* NUL is invariant */
224 /* c!=0 now, one branch below checks c==0 for variant characters */
227 * no assertions here because these functions are legitimately called
228 * for strings with variant characters
230 #if U_CHARSET_FAMILY==U_ASCII_FAMILY
231 if(!UCHAR_IS_INVARIANT(c
)) {
232 return FALSE
; /* found a variant char */
234 #elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY
235 c
=asciiFromEbcdic
[c
];
236 if(c
==0 || !UCHAR_IS_INVARIANT(c
)) {
237 return FALSE
; /* found a variant char */
240 # error U_CHARSET_FAMILY is not valid
/*
 * uprv_isInvariantUString: UChar counterpart of the invariant-string check.
 *
 * Visible behavior: returns FALSE as soon as a unit c fails UCHAR_IS_INVARIANT.
 * No charset-family mapping is applied on the visible path.
 * NOTE(review): the loop, the handling of `length`, and the final return are not
 * visible in this extract.
 */
246 U_CAPI UBool U_EXPORT2
247 uprv_isInvariantUString(const UChar
*s
, int32_t length
) {
267 * no assertions here because these functions are legitimately called
268 * for strings with variant characters
270 if(!UCHAR_IS_INVARIANT(c
)) {
271 return FALSE
; /* found a variant char */
277 /* UDataSwapFn implementations used in udataswp.c ------- */
279 /* convert ASCII to EBCDIC and verify that all characters are invariant */
/*
 * uprv_ebcdicFromAscii: maps ASCII bytes from inData to EBCDIC bytes in outData.
 *
 * Visible behavior:
 *  - leaves early when pErrorCode is NULL or already holds a failure;
 *  - sets U_ILLEGAL_ARGUMENT_ERROR when ds or inData is NULL, length is negative,
 *    or length>0 with a NULL outData;
 *  - for a byte that fails UCHAR_IS_INVARIANT, reports through udata_printError
 *    and sets U_INVALID_CHAR_FOUND;
 *  - otherwise stores ebcdicFromAscii[c] through the output pointer.
 * NOTE(review): declarations, the loop header and the return statements are not
 * visible in this extract.
 */
281 uprv_ebcdicFromAscii(const UDataSwapper
*ds
,
282 const void *inData
, int32_t length
, void *outData
,
283 UErrorCode
*pErrorCode
) {
290 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
293 if(ds
==NULL
|| inData
==NULL
|| length
<0 || (length
>0 && outData
==NULL
)) {
294 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
298 /* setup and swapping */
299 s
=(const uint8_t *)inData
;
300 t
=(uint8_t *)outData
;
304 if(!UCHAR_IS_INVARIANT(c
)) {
305 udata_printError(ds
, "uprv_ebcdicFromAscii() string[%d] contains a variant character in position %d\n",
306 length
, length
-count
);
307 *pErrorCode
=U_INVALID_CHAR_FOUND
;
310 *t
++=ebcdicFromAscii
[c
];
317 /* this function only checks and copies ASCII strings without conversion */
/*
 * uprv_copyAscii: checks that an ASCII string is invariant and copies it unchanged.
 *
 * Visible behavior:
 *  - leaves early when pErrorCode is NULL or already holds a failure;
 *  - sets U_ILLEGAL_ARGUMENT_ERROR when ds or inData is NULL, length is negative,
 *    or length>0 with a NULL outData;
 *  - for a byte that fails UCHAR_IS_INVARIANT, reports through udata_printError
 *    and sets U_INVALID_CHAR_FOUND;
 *  - copies length bytes with uprv_memcpy when length>0 and the buffers differ.
 * The diagnostic names this function (uprv_copyAscii) so the message can be
 * traced back to its source.
 * NOTE(review): declarations, the loop header and the return statements are not
 * visible in this extract.
 */
319 uprv_copyAscii(const UDataSwapper
*ds
,
320 const void *inData
, int32_t length
, void *outData
,
321 UErrorCode
*pErrorCode
) {
327 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
330 if(ds
==NULL
|| inData
==NULL
|| length
<0 || (length
>0 && outData
==NULL
)) {
331 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
335 /* setup and checking */
336 s
=(const uint8_t *)inData
;
340 if(!UCHAR_IS_INVARIANT(c
)) {
341 udata_printError(ds
, "uprv_copyAscii() string[%d] contains a variant character in position %d\n",
342 length
, length
-count
);
343 *pErrorCode
=U_INVALID_CHAR_FOUND
;
349 if(length
>0 && inData
!=outData
) {
350 uprv_memcpy(outData
, inData
, length
);
356 /* convert EBCDIC to ASCII and verify that all characters are invariant */
/*
 * uprv_asciiFromEbcdic: maps EBCDIC bytes from inData to ASCII bytes in outData.
 *
 * Visible behavior:
 *  - leaves early when pErrorCode is NULL or already holds a failure;
 *  - sets U_ILLEGAL_ARGUMENT_ERROR when ds or inData is NULL, length is negative,
 *    or length>0 with a NULL outData;
 *  - a non-zero byte is replaced by asciiFromEbcdic[c]; if that mapping is 0 or the
 *    mapped code fails UCHAR_IS_INVARIANT, the error is reported through
 *    udata_printError and U_INVALID_CHAR_FOUND is set. A zero byte skips the check.
 * NOTE(review): declarations, the loop header, the store to the output and the
 * return statements are not visible in this extract.
 */
358 uprv_asciiFromEbcdic(const UDataSwapper
*ds
,
359 const void *inData
, int32_t length
, void *outData
,
360 UErrorCode
*pErrorCode
) {
367 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
370 if(ds
==NULL
|| inData
==NULL
|| length
<0 || (length
>0 && outData
==NULL
)) {
371 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
375 /* setup and swapping */
376 s
=(const uint8_t *)inData
;
377 t
=(uint8_t *)outData
;
381 if(c
!=0 && ((c
=asciiFromEbcdic
[c
])==0 || !UCHAR_IS_INVARIANT(c
))) {
382 udata_printError(ds
, "uprv_asciiFromEbcdic() string[%d] contains a variant character in position %d\n",
383 length
, length
-count
);
384 *pErrorCode
=U_INVALID_CHAR_FOUND
;
394 /* this function only checks and copies EBCDIC strings without conversion */
/*
 * uprv_copyEbcdic: checks that an EBCDIC string is invariant and copies it unchanged.
 *
 * Visible behavior:
 *  - leaves early when pErrorCode is NULL or already holds a failure;
 *  - sets U_ILLEGAL_ARGUMENT_ERROR when ds or inData is NULL, length is negative,
 *    or length>0 with a NULL outData;
 *  - a non-zero byte is mapped through asciiFromEbcdic for checking only; if the
 *    mapping is 0 or the mapped code fails UCHAR_IS_INVARIANT, the error is reported
 *    through udata_printError and U_INVALID_CHAR_FOUND is set;
 *  - copies length bytes with uprv_memcpy when length>0 and the buffers differ.
 * The diagnostic passes two integers (length and position), so the format string
 * needs two %d conversions; "string[%]" was an incomplete conversion specification.
 * NOTE(review): declarations, the loop header and the return statements are not
 * visible in this extract.
 */
396 uprv_copyEbcdic(const UDataSwapper
*ds
,
397 const void *inData
, int32_t length
, void *outData
,
398 UErrorCode
*pErrorCode
) {
404 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
407 if(ds
==NULL
|| inData
==NULL
|| length
<0 || (length
>0 && outData
==NULL
)) {
408 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
412 /* setup and checking */
413 s
=(const uint8_t *)inData
;
417 if(c
!=0 && ((c
=asciiFromEbcdic
[c
])==0 || !UCHAR_IS_INVARIANT(c
))) {
418 udata_printError(ds
, "uprv_copyEbcdic() string[%d] contains a variant character in position %d\n",
419 length
, length
-count
);
420 *pErrorCode
=U_INVALID_CHAR_FOUND
;
426 if(length
>0 && inData
!=outData
) {
427 uprv_memcpy(outData
, inData
, length
);
433 /* compare invariant strings; variant characters compare less than others and unlike each other */
/*
 * uprv_compareInvAscii: compares an ASCII char string (outString) with a UChar
 * string (localString).
 *
 * Visible behavior:
 *  - rejects NULL strings and lengths below -1 (the rejection result is not visible);
 *  - outLength / localLength can be recomputed with uprv_strlen / u_strlen
 *    (presumably when -1 is passed; the guarding condition is not visible here);
 *  - walks at most minLength = min(outLength, localLength) units, testing each side
 *    with UCHAR_IS_INVARIANT;
 *  - when the common prefix is equal, returns outLength-localLength.
 * NOTE(review): the loop, the treatment of variant characters and the per-character
 * return are not visible in this extract.
 */
435 uprv_compareInvAscii(const UDataSwapper
*ds
,
436 const char *outString
, int32_t outLength
,
437 const UChar
*localString
, int32_t localLength
) {
442 if(outString
==NULL
|| outLength
<-1 || localString
==NULL
|| localLength
<-1) {
447 outLength
=(int32_t)uprv_strlen(outString
);
450 localLength
=u_strlen(localString
);
453 minLength
= outLength
<localLength
? outLength
: localLength
;
456 c
=(uint8_t)*outString
++;
457 if(UCHAR_IS_INVARIANT(c
)) {
464 if(!UCHAR_IS_INVARIANT(c2
)) {
475 /* strings start with same prefix, compare lengths */
476 return outLength
-localLength
;
/*
 * uprv_compareInvEbcdic: compares an EBCDIC char string (outString) with a UChar
 * string (localString).
 *
 * Visible behavior:
 *  - rejects NULL strings and lengths below -1 (the rejection result is not visible);
 *  - outLength / localLength can be recomputed with uprv_strlen / u_strlen
 *    (presumably when -1 is passed; the guarding condition is not visible here);
 *  - walks at most minLength = min(outLength, localLength) units; each outString byte
 *    is mapped through asciiFromEbcdic, and the mapping must be non-zero and pass
 *    UCHAR_IS_INVARIANT to take the branch at that test;
 *  - the localString unit c2 is tested with UCHAR_IS_INVARIANT;
 *  - when the common prefix is equal, returns outLength-localLength.
 * NOTE(review): the loop, the treatment of variant characters and the per-character
 * return are not visible in this extract.
 */
480 uprv_compareInvEbcdic(const UDataSwapper
*ds
,
481 const char *outString
, int32_t outLength
,
482 const UChar
*localString
, int32_t localLength
) {
487 if(outString
==NULL
|| outLength
<-1 || localString
==NULL
|| localLength
<-1) {
492 outLength
=(int32_t)uprv_strlen(outString
);
495 localLength
=u_strlen(localString
);
498 minLength
= outLength
<localLength
? outLength
: localLength
;
501 c
=(uint8_t)*outString
++;
504 } else if((c1
=asciiFromEbcdic
[c
])!=0 && UCHAR_IS_INVARIANT(c1
)) {
511 if(!UCHAR_IS_INVARIANT(c2
)) {
522 /* strings start with same prefix, compare lengths */
523 return outLength
-localLength
;