]>
git.saurik.com Git - apple/icu.git/blob - icuSources/common/uinvchar.c
2 *******************************************************************************
4 * Copyright (C) 1999-2010, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * file name: uinvchar.c
10 * tab size: 8 (not used)
13 * created on: 2004sep14
14 * created by: Markus W. Scherer
16 * Functions for handling invariant characters, moved here from putil.c
17 * for better modularization.
20 #include "unicode/utypes.h"
21 #include "unicode/ustring.h"
28 /* invariant-character handling --------------------------------------------- */
31 * These maps for ASCII to/from EBCDIC map invariant characters (see utypes.h)
32 * appropriately for most EBCDIC codepages.
34 * They currently also map most other ASCII graphic characters,
35 * appropriately for codepages 37 and 1047.
36 * Exceptions: The characters for []^ have different codes in 37 & 1047.
37 * Both versions are mapped to ASCII.
44 * There are no mappings for variant characters from Unicode to EBCDIC.
46 * Currently, C0 control codes are also included in these maps.
47 * Exceptions: S/390 Open Edition swaps LF and NEL codes compared with other
48 * EBCDIC platforms; both codes (15 and 25) are mapped to ASCII LF (0A),
49 * but there is no mapping for ASCII LF back to EBCDIC.
51 * ASCII EBCDIC S/390-OE
55 * The maps below explicitly exclude the variant
56 * control and graphical characters that are in ASCII-based
57 * codepages at 0x80 and above.
58 * "No mapping" is expressed by mapping to a 00 byte.
60 * These tables do not establish a converter or a codepage.
/*
 * EBCDIC-to-ASCII byte map, indexed by the EBCDIC code (0..255).
 *
 * An entry of 0x00 at any index other than 0 means "no mapping".
 * Both EBCDIC 0x15 and 0x25 map to ASCII LF (0x0a), and both the
 * codepage-37 and codepage-1047 codes for '[' ']' '^' map to the
 * corresponding ASCII character.
 */
static const uint8_t asciiFromEbcdic[256]={
    /* 0x00..0x3f: control codes */
    0x00, 0x01, 0x02, 0x03, 0x00, 0x09, 0x00, 0x7f, 0x00, 0x00, 0x00, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    0x10, 0x11, 0x12, 0x13, 0x00, 0x0a, 0x08, 0x00, 0x18, 0x19, 0x00, 0x00, 0x1c, 0x1d, 0x1e, 0x1f,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x0a, 0x17, 0x1b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x06, 0x07,
    0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x14, 0x15, 0x00, 0x1a,

    /* 0x40..0x7f: space and punctuation */
    0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2e, 0x3c, 0x28, 0x2b, 0x7c,
    0x26, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x21, 0x24, 0x2a, 0x29, 0x3b, 0x5e,
    0x2d, 0x2f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2c, 0x25, 0x5f, 0x3e, 0x3f,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x60, 0x3a, 0x23, 0x40, 0x27, 0x3d, 0x22,

    /* 0x80..0xbf: lowercase letters, '~', and the two sets of '[' ']' '^' codes */
    0x00, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x7e, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x00, 0x00, 0x00, 0x5b, 0x00, 0x00,
    0x5e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x5b, 0x5d, 0x00, 0x5d, 0x00, 0x00,

    /* 0xc0..0xff: braces, backslash, uppercase letters, digits */
    0x7b, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x7d, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50, 0x51, 0x52, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x5c, 0x00, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
};
/*
 * ASCII-to-EBCDIC byte map, indexed by the ASCII/Unicode code (0..255).
 *
 * An entry of 0x00 at any index other than 0 means "no mapping".
 * There is deliberately no mapping for ASCII LF (0x0a) and none for the
 * variant graphic characters; every index from 0x80 upward is unmapped.
 */
static const uint8_t ebcdicFromAscii[256]={
    /* 0x00..0x3f: control codes, space, punctuation, digits */
    0x00, 0x01, 0x02, 0x03, 0x37, 0x2d, 0x2e, 0x2f, 0x16, 0x05, 0x00, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    0x10, 0x11, 0x12, 0x13, 0x3c, 0x3d, 0x32, 0x26, 0x18, 0x19, 0x3f, 0x27, 0x1c, 0x1d, 0x1e, 0x1f,
    0x40, 0x00, 0x7f, 0x00, 0x00, 0x6c, 0x50, 0x7d, 0x4d, 0x5d, 0x5c, 0x4e, 0x6b, 0x60, 0x4b, 0x61,
    0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0x7a, 0x5e, 0x4c, 0x7e, 0x6e, 0x6f,

    /* 0x40..0x7f: letters, '_' and DEL */
    0x00, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6,
    0xd7, 0xd8, 0xd9, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0x00, 0x00, 0x00, 0x00, 0x6d,
    0x00, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96,
    0x97, 0x98, 0x99, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0x00, 0x00, 0x00, 0x00, 0x07,

    /* 0x80..0xbf: no mappings */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0xc0..0xff: no mappings */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
/*
 * Same as asciiFromEbcdic[] except maps all letters to lowercase:
 * the EBCDIC uppercase letters (0xc1..0xc9, 0xd1..0xd9, 0xe2..0xe9)
 * yield ASCII 'a'..'z'. Every non-letter entry is identical to
 * asciiFromEbcdic[]; in particular EBCDIC 0xe0 stays a backslash (0x5c)
 * and is not shifted by 0x20 into '|' (0x7c), which would collide with
 * the mapping of EBCDIC 0x4f.
 */
static const uint8_t lowercaseAsciiFromEbcdic[256]={
    /* 0x00..0x3f: control codes */
    0x00, 0x01, 0x02, 0x03, 0x00, 0x09, 0x00, 0x7f, 0x00, 0x00, 0x00, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    0x10, 0x11, 0x12, 0x13, 0x00, 0x0a, 0x08, 0x00, 0x18, 0x19, 0x00, 0x00, 0x1c, 0x1d, 0x1e, 0x1f,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x0a, 0x17, 0x1b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x06, 0x07,
    0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x14, 0x15, 0x00, 0x1a,

    /* 0x40..0x7f: space and punctuation */
    0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2e, 0x3c, 0x28, 0x2b, 0x7c,
    0x26, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x21, 0x24, 0x2a, 0x29, 0x3b, 0x5e,
    0x2d, 0x2f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2c, 0x25, 0x5f, 0x3e, 0x3f,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x60, 0x3a, 0x23, 0x40, 0x27, 0x3d, 0x22,

    /* 0x80..0xbf: lowercase letters, '~', and the two sets of '[' ']' '^' codes */
    0x00, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x7e, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x00, 0x00, 0x00, 0x5b, 0x00, 0x00,
    0x5e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x5b, 0x5d, 0x00, 0x5d, 0x00, 0x00,

    /* 0xc0..0xff: braces, backslash, uppercase letters folded to lowercase, digits */
    0x7b, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x7d, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x5c, 0x00, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
};
/*
 * Bit sets indicating which characters of the ASCII repertoire
 * (by ASCII/Unicode code) are "invariant".
 * See utypes.h for more details.
 *
 * As invariant are considered the characters of the ASCII repertoire except
 * for the following:
 * 0a     <line feed>
 * 21 '!' <exclamation mark>
 * 23 '#' <number sign>
 * 24 '$' <dollar sign>
 *
 * 40 '@' <commercial at>
 *
 * 5b '[' <left bracket>
 * 5c '\' <backslash>
 * 5d ']' <right bracket>
 * 5e '^' <circumflex>
 *
 * 60 '`' <grave accent>
 *
 * 7b '{' <left brace>
 * 7c '|' <vertical line>
 * 7d '}' <right brace>
 * 7e '~' <tilde>
 */
/*
 * Bit set over the code points 0x00..0x7f: word c>>5, bit c&0x1f is set
 * when code point c is an invariant character.
 */
static const uint32_t invariantChars[4]={
    0xfffffbff, /* 00..1f but not 0a */
    0xffffffe5, /* 20..3f but not 21 23 24 */
    0x87fffffe, /* 40..5f but not 40 5b..5e */
    0x87fffffe  /* 60..7f but not 60 7b..7e */
};

/*
 * test unsigned types (or values known to be non-negative) for invariant characters,
 * tests ASCII-family character values
 *
 * Anything above 0x7f is rejected before the table is indexed.
 * Note: the argument is evaluated more than once; pass a plain variable
 * or constant, never an expression with side effects.
 */
#define UCHAR_IS_INVARIANT(c) (((c)<=0x7f) && (invariantChars[(c)>>5]&((uint32_t)1<<((c)&0x1f)))!=0)

/*
 * test signed types for invariant characters, adds test for positive values
 * (negative values are rejected before any shifting or indexing happens)
 */
#define SCHAR_IS_INVARIANT(c) ((0<=(c)) && UCHAR_IS_INVARIANT(c))
/*
 * Conversion between platform char codes and ASCII/Unicode codes.
 * On ASCII-family platforms both directions are the identity; on
 * EBCDIC-family platforms they are table lookups, so the argument must
 * be a valid index (0..255) into the 256-entry maps.
 * Any other charset family is rejected at compile time.
 */
#if U_CHARSET_FAMILY==U_ASCII_FAMILY
#define CHAR_TO_UCHAR(c) (c)
#define UCHAR_TO_CHAR(c) (c)
#elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY
#define CHAR_TO_UCHAR(u) asciiFromEbcdic[u]
#define UCHAR_TO_CHAR(u) ebcdicFromAscii[u]
#else
#   error U_CHARSET_FAMILY is not valid
#endif
182 U_CAPI
void U_EXPORT2
183 u_charsToUChars(const char *cs
, UChar
*us
, int32_t length
) {
188 * Allow the entire ASCII repertoire to be mapped _to_ Unicode.
189 * For EBCDIC systems, this works for characters with codes from
190 * codepages 37 and 1047 or compatible.
194 u
=(UChar
)CHAR_TO_UCHAR(c
);
195 U_ASSERT((u
!=0 || c
==0)); /* only invariant chars converted? */
201 U_CAPI
void U_EXPORT2
202 u_UCharsToChars(const UChar
*us
, char *cs
, int32_t length
) {
207 if(!UCHAR_IS_INVARIANT(u
)) {
208 U_ASSERT(FALSE
); /* Variant characters were used. These are not portable in ICU. */
211 *cs
++=(char)UCHAR_TO_CHAR(u
);
216 U_CAPI UBool U_EXPORT2
217 uprv_isInvariantString(const char *s
, int32_t length
) {
235 continue; /* NUL is invariant */
238 /* c!=0 now, one branch below checks c==0 for variant characters */
241 * no assertions here because these functions are legitimately called
242 * for strings with variant characters
244 #if U_CHARSET_FAMILY==U_ASCII_FAMILY
245 if(!UCHAR_IS_INVARIANT(c
)) {
246 return FALSE
; /* found a variant char */
248 #elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY
250 if(c
==0 || !UCHAR_IS_INVARIANT(c
)) {
251 return FALSE
; /* found a variant char */
254 # error U_CHARSET_FAMILY is not valid
260 U_CAPI UBool U_EXPORT2
261 uprv_isInvariantUString(const UChar
*s
, int32_t length
) {
281 * no assertions here because these functions are legitimately called
282 * for strings with variant characters
284 if(!UCHAR_IS_INVARIANT(c
)) {
285 return FALSE
; /* found a variant char */
291 /* UDataSwapFn implementations used in udataswp.c ------- */
293 /* convert ASCII to EBCDIC and verify that all characters are invariant */
294 U_CAPI
int32_t U_EXPORT2
295 uprv_ebcdicFromAscii(const UDataSwapper
*ds
,
296 const void *inData
, int32_t length
, void *outData
,
297 UErrorCode
*pErrorCode
) {
304 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
307 if(ds
==NULL
|| inData
==NULL
|| length
<0 || (length
>0 && outData
==NULL
)) {
308 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
312 /* setup and swapping */
313 s
=(const uint8_t *)inData
;
314 t
=(uint8_t *)outData
;
318 if(!UCHAR_IS_INVARIANT(c
)) {
319 udata_printError(ds
, "uprv_ebcdicFromAscii() string[%d] contains a variant character in position %d\n",
320 length
, length
-count
);
321 *pErrorCode
=U_INVALID_CHAR_FOUND
;
324 *t
++=ebcdicFromAscii
[c
];
331 /* this function only checks and copies ASCII strings without conversion */
333 uprv_copyAscii(const UDataSwapper
*ds
,
334 const void *inData
, int32_t length
, void *outData
,
335 UErrorCode
*pErrorCode
) {
341 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
344 if(ds
==NULL
|| inData
==NULL
|| length
<0 || (length
>0 && outData
==NULL
)) {
345 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
349 /* setup and checking */
350 s
=(const uint8_t *)inData
;
354 if(!UCHAR_IS_INVARIANT(c
)) {
355 udata_printError(ds
, "uprv_copyFromAscii() string[%d] contains a variant character in position %d\n",
356 length
, length
-count
);
357 *pErrorCode
=U_INVALID_CHAR_FOUND
;
363 if(length
>0 && inData
!=outData
) {
364 uprv_memcpy(outData
, inData
, length
);
370 /* convert EBCDIC to ASCII and verify that all characters are invariant */
372 uprv_asciiFromEbcdic(const UDataSwapper
*ds
,
373 const void *inData
, int32_t length
, void *outData
,
374 UErrorCode
*pErrorCode
) {
381 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
384 if(ds
==NULL
|| inData
==NULL
|| length
<0 || (length
>0 && outData
==NULL
)) {
385 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
389 /* setup and swapping */
390 s
=(const uint8_t *)inData
;
391 t
=(uint8_t *)outData
;
395 if(c
!=0 && ((c
=asciiFromEbcdic
[c
])==0 || !UCHAR_IS_INVARIANT(c
))) {
396 udata_printError(ds
, "uprv_asciiFromEbcdic() string[%d] contains a variant character in position %d\n",
397 length
, length
-count
);
398 *pErrorCode
=U_INVALID_CHAR_FOUND
;
408 /* this function only checks and copies EBCDIC strings without conversion */
410 uprv_copyEbcdic(const UDataSwapper
*ds
,
411 const void *inData
, int32_t length
, void *outData
,
412 UErrorCode
*pErrorCode
) {
418 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
421 if(ds
==NULL
|| inData
==NULL
|| length
<0 || (length
>0 && outData
==NULL
)) {
422 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
426 /* setup and checking */
427 s
=(const uint8_t *)inData
;
431 if(c
!=0 && ((c
=asciiFromEbcdic
[c
])==0 || !UCHAR_IS_INVARIANT(c
))) {
432 udata_printError(ds
, "uprv_copyEbcdic() string[%] contains a variant character in position %d\n",
433 length
, length
-count
);
434 *pErrorCode
=U_INVALID_CHAR_FOUND
;
440 if(length
>0 && inData
!=outData
) {
441 uprv_memcpy(outData
, inData
, length
);
447 /* compare invariant strings; variant characters compare less than others and unlike each other */
449 uprv_compareInvAscii(const UDataSwapper
*ds
,
450 const char *outString
, int32_t outLength
,
451 const UChar
*localString
, int32_t localLength
) {
456 if(outString
==NULL
|| outLength
<-1 || localString
==NULL
|| localLength
<-1) {
461 outLength
=(int32_t)uprv_strlen(outString
);
464 localLength
=u_strlen(localString
);
467 minLength
= outLength
<localLength
? outLength
: localLength
;
470 c
=(uint8_t)*outString
++;
471 if(UCHAR_IS_INVARIANT(c
)) {
478 if(!UCHAR_IS_INVARIANT(c2
)) {
489 /* strings start with same prefix, compare lengths */
490 return outLength
-localLength
;
494 uprv_compareInvEbcdic(const UDataSwapper
*ds
,
495 const char *outString
, int32_t outLength
,
496 const UChar
*localString
, int32_t localLength
) {
501 if(outString
==NULL
|| outLength
<-1 || localString
==NULL
|| localLength
<-1) {
506 outLength
=(int32_t)uprv_strlen(outString
);
509 localLength
=u_strlen(localString
);
512 minLength
= outLength
<localLength
? outLength
: localLength
;
515 c
=(uint8_t)*outString
++;
518 } else if((c1
=asciiFromEbcdic
[c
])!=0 && UCHAR_IS_INVARIANT(c1
)) {
525 if(!UCHAR_IS_INVARIANT(c2
)) {
536 /* strings start with same prefix, compare lengths */
537 return outLength
-localLength
;
540 U_CAPI
int32_t U_EXPORT2
541 uprv_compareInvEbcdicAsAscii(const char *s1
, const char *s2
) {
548 if(c1
!=0 && ((c1
=asciiFromEbcdic
[c1
])==0 || !UCHAR_IS_INVARIANT(c1
))) {
549 c1
=-(int32_t)(uint8_t)*s1
;
551 if(c2
!=0 && ((c2
=asciiFromEbcdic
[c2
])==0 || !UCHAR_IS_INVARIANT(c2
))) {
552 c2
=-(int32_t)(uint8_t)*s2
;
561 U_CAPI
char U_EXPORT2
562 uprv_ebcdicToLowercaseAscii(char c
) {
563 return (char)lowercaseAsciiFromEbcdic
[(uint8_t)c
];
566 U_INTERNAL
uint8_t* U_EXPORT2
567 uprv_aestrncpy(uint8_t *dst
, const uint8_t *src
, int32_t n
)
569 uint8_t *orig_dst
= dst
;
572 n
= uprv_strlen((const char*)src
)+1; /* copy NUL */
576 *(dst
++) = asciiFromEbcdic
[*(src
++)];
587 U_INTERNAL
uint8_t* U_EXPORT2
588 uprv_eastrncpy(uint8_t *dst
, const uint8_t *src
, int32_t n
)
590 uint8_t *orig_dst
= dst
;
593 n
= uprv_strlen((const char*)src
)+1; /* copy NUL */
597 char ch
= ebcdicFromAscii
[*(src
++)];
599 ch
= ebcdicFromAscii
[0x3f]; /* questionmark (subchar) */