]>
git.saurik.com Git - apple/icu.git/blob - icuSources/common/uinvchar.cpp
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 *******************************************************************************
6 * Copyright (C) 1999-2010, International Business Machines
7 * Corporation and others. All Rights Reserved.
9 *******************************************************************************
10 * file name: uinvchar.cpp
12 * tab size: 8 (not used)
15 * created on: 2004sep14
16 * created by: Markus W. Scherer
18 * Functions for handling invariant characters, moved here from putil.c
19 * for better modularization.
22 #include "unicode/utypes.h"
23 #include "unicode/ustring.h"
30 /* invariant-character handling --------------------------------------------- */
33 * These maps for ASCII to/from EBCDIC map invariant characters (see utypes.h)
34 * appropriately for most EBCDIC codepages.
36 * They currently also map most other ASCII graphic characters,
37 * appropriately for codepages 37 and 1047.
38 * Exceptions: The characters for []^ have different codes in 37 & 1047.
39 * Both versions are mapped to ASCII.
46 * There are no mappings for variant characters from Unicode to EBCDIC.
48 * Currently, C0 control codes are also included in these maps.
49 * Exceptions: S/390 Open Edition swaps LF and NEL codes compared with other
50 * EBCDIC platforms; both codes (15 and 25) are mapped to ASCII LF (0A),
51 * but there is no mapping for ASCII LF back to EBCDIC.
53 * ASCII EBCDIC S/390-OE
57 * The maps below explicitly exclude the variant
58 * control and graphical characters that are in ASCII-based
59 * codepages at 0x80 and above.
60 * "No mapping" is expressed by mapping to a 00 byte.
62 * These tables do not establish a converter or a codepage.
/*
 * asciiFromEbcdic[e] is the ASCII code for the EBCDIC byte e.
 * A 0x00 entry at any index other than 0 means "no mapping".
 * Several ASCII codes are reachable from two EBCDIC bytes:
 * 0x0a from 0x15 and 0x25, 0x5b from 0xad and 0xba,
 * 0x5d from 0xbb and 0xbd, 0x5e from 0x5f and 0xb0.
 */
65 static const uint8_t asciiFromEbcdic
[256]={
66 0x00, 0x01, 0x02, 0x03, 0x00, 0x09, 0x00, 0x7f, 0x00, 0x00, 0x00, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
67 0x10, 0x11, 0x12, 0x13, 0x00, 0x0a, 0x08, 0x00, 0x18, 0x19, 0x00, 0x00, 0x1c, 0x1d, 0x1e, 0x1f,
68 0x00, 0x00, 0x00, 0x00, 0x00, 0x0a, 0x17, 0x1b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x06, 0x07,
69 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x14, 0x15, 0x00, 0x1a,
71 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2e, 0x3c, 0x28, 0x2b, 0x7c,
72 0x26, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x21, 0x24, 0x2a, 0x29, 0x3b, 0x5e,
73 0x2d, 0x2f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2c, 0x25, 0x5f, 0x3e, 0x3f,
74 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x60, 0x3a, 0x23, 0x40, 0x27, 0x3d, 0x22,
76 0x00, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
77 0x00, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
78 0x00, 0x7e, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x00, 0x00, 0x00, 0x5b, 0x00, 0x00,
79 0x5e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x5b, 0x5d, 0x00, 0x5d, 0x00, 0x00,
81 0x7b, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
82 0x7d, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50, 0x51, 0x52, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
83 0x5c, 0x00, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
84 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
/*
 * ebcdicFromAscii[a] is the EBCDIC byte for the ASCII code a.
 * A 0x00 entry at any index other than 0 means "no mapping".
 * ASCII LF (index 0x0a) has no mapping, and every index from
 * 0x80 to 0xff is unmapped as well.
 */
87 static const uint8_t ebcdicFromAscii
[256]={
88 0x00, 0x01, 0x02, 0x03, 0x37, 0x2d, 0x2e, 0x2f, 0x16, 0x05, 0x00, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
89 0x10, 0x11, 0x12, 0x13, 0x3c, 0x3d, 0x32, 0x26, 0x18, 0x19, 0x3f, 0x27, 0x1c, 0x1d, 0x1e, 0x1f,
90 0x40, 0x00, 0x7f, 0x00, 0x00, 0x6c, 0x50, 0x7d, 0x4d, 0x5d, 0x5c, 0x4e, 0x6b, 0x60, 0x4b, 0x61,
91 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0x7a, 0x5e, 0x4c, 0x7e, 0x6e, 0x6f,
93 0x00, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6,
94 0xd7, 0xd8, 0xd9, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0x00, 0x00, 0x00, 0x00, 0x6d,
95 0x00, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96,
96 0x97, 0x98, 0x99, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0x00, 0x00, 0x00, 0x00, 0x07,
98 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
99 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
100 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
101 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
103 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
104 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
105 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
106 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
/*
 * Same as asciiFromEbcdic[] except maps all letters to lowercase:
 * the EBCDIC uppercase letters (0xc1..0xc9, 0xd1..0xd9, 0xe2..0xe9) yield
 * ASCII 'a'..'z' (0x61..0x7a) instead of 'A'..'Z'.
 * Every non-letter entry is identical to asciiFromEbcdic[]; in particular
 * index 0xe0 maps to 0x5c, not 0x7c.
 * A 0x00 entry at any index other than 0 means "no mapping".
 */
static const uint8_t lowercaseAsciiFromEbcdic[256]={
    0x00, 0x01, 0x02, 0x03, 0x00, 0x09, 0x00, 0x7f, 0x00, 0x00, 0x00, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    0x10, 0x11, 0x12, 0x13, 0x00, 0x0a, 0x08, 0x00, 0x18, 0x19, 0x00, 0x00, 0x1c, 0x1d, 0x1e, 0x1f,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x0a, 0x17, 0x1b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x06, 0x07,
    0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x14, 0x15, 0x00, 0x1a,

    0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2e, 0x3c, 0x28, 0x2b, 0x7c,
    0x26, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x21, 0x24, 0x2a, 0x29, 0x3b, 0x5e,
    0x2d, 0x2f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2c, 0x25, 0x5f, 0x3e, 0x3f,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x60, 0x3a, 0x23, 0x40, 0x27, 0x3d, 0x22,

    0x00, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x7e, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x00, 0x00, 0x00, 0x5b, 0x00, 0x00,
    0x5e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x5b, 0x5d, 0x00, 0x5d, 0x00, 0x00,

    0x7b, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x7d, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x5c, 0x00, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
};
133 * Bit sets indicating which characters of the ASCII repertoire
134 * (by ASCII/Unicode code) are "invariant".
135 * See utypes.h for more details.
137 * As invariant are considered the characters of the ASCII repertoire except
139 * 21 '!' <exclamation mark>
140 * 23 '#' <number sign>
141 * 24 '$' <dollar sign>
143 * 40 '@' <commercial at>
145 * 5b '[' <left bracket>
147 * 5d ']' <right bracket>
148 * 5e '^' <circumflex>
150 * 60 '`' <grave accent>
152 * 7b '{' <left brace>
153 * 7c '|' <vertical line>
154 * 7d '}' <right brace>
/*
 * One bit per ASCII code 0x00..0x7f, 32 codes per word:
 * word index is (c>>5), bit index is (c&0x1f).
 * A set bit marks the code as invariant; a clear bit marks it as variant.
 */
157 static const uint32_t invariantChars
[4]={
158 0xfffffbff, /* 00..1f but not 0a */
159 0xffffffe5, /* 20..3f but not 21 23 24 */
160 0x87fffffe, /* 40..5f but not 40 5b..5e */
161 0x87fffffe /* 60..7f but not 60 7b..7e */
165 * test unsigned types (or values known to be non-negative) for invariant characters,
166 * tests ASCII-family character values
/*
 * Nonzero if c is at most 0x7f and its bit is set in invariantChars[].
 * The argument is evaluated up to three times, so it must have no side effects.
 * A negative c would pass the (c)<=0x7f test and then index out of range;
 * callers must pass an unsigned or known non-negative value.
 */
168 #define UCHAR_IS_INVARIANT(c) (((c)<=0x7f) && (invariantChars[(c)>>5]&((uint32_t)1<<((c)&0x1f)))!=0)
170 /* test signed types for invariant characters, adds test for positive values */
/*
 * Rejects negative values before delegating to UCHAR_IS_INVARIANT, so the
 * table lookup is never reached with a negative index.
 * The argument is evaluated more than once and must have no side effects.
 */
171 #define SCHAR_IS_INVARIANT(c) ((0<=(c)) && UCHAR_IS_INVARIANT(c))
/*
 * Conversion between platform-charset bytes and ASCII/Unicode code values.
 *
 * CHAR_TO_UCHAR(x): platform char -> ASCII code.
 * UCHAR_TO_CHAR(x): ASCII code -> platform char.
 *
 * On ASCII-family platforms both are the identity; the expansion is
 * parenthesized so that an expression argument keeps its own precedence.
 * On EBCDIC-family platforms they index the 256-entry mapping tables, so the
 * argument must be in the range 0..255.
 */
#if U_CHARSET_FAMILY==U_ASCII_FAMILY
#define CHAR_TO_UCHAR(c) (c)
#define UCHAR_TO_CHAR(c) (c)
#elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY
#define CHAR_TO_UCHAR(u) asciiFromEbcdic[u]
#define UCHAR_TO_CHAR(u) ebcdicFromAscii[u]
#else
#   error U_CHARSET_FAMILY is not valid
#endif
184 U_CAPI
void U_EXPORT2
185 u_charsToUChars(const char *cs
, UChar
*us
, int32_t length
) {
190 * Allow the entire ASCII repertoire to be mapped _to_ Unicode.
191 * For EBCDIC systems, this works for characters with codes from
192 * codepages 37 and 1047 or compatible.
196 u
=(UChar
)CHAR_TO_UCHAR(c
);
197 U_ASSERT((u
!=0 || c
==0)); /* only invariant chars converted? */
203 U_CAPI
void U_EXPORT2
204 u_UCharsToChars(const UChar
*us
, char *cs
, int32_t length
) {
209 if(!UCHAR_IS_INVARIANT(u
)) {
210 U_ASSERT(FALSE
); /* Variant characters were used. These are not portable in ICU. */
213 *cs
++=(char)UCHAR_TO_CHAR(u
);
218 U_CAPI UBool U_EXPORT2
219 uprv_isInvariantString(const char *s
, int32_t length
) {
237 continue; /* NUL is invariant */
240 /* c!=0 now, one branch below checks c==0 for variant characters */
243 * no assertions here because these functions are legitimately called
244 * for strings with variant characters
246 #if U_CHARSET_FAMILY==U_ASCII_FAMILY
247 if(!UCHAR_IS_INVARIANT(c
)) {
248 return FALSE
; /* found a variant char */
250 #elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY
252 if(c
==0 || !UCHAR_IS_INVARIANT(c
)) {
253 return FALSE
; /* found a variant char */
256 # error U_CHARSET_FAMILY is not valid
262 U_CAPI UBool U_EXPORT2
263 uprv_isInvariantUString(const UChar
*s
, int32_t length
) {
283 * no assertions here because these functions are legitimately called
284 * for strings with variant characters
286 if(!UCHAR_IS_INVARIANT(c
)) {
287 return FALSE
; /* found a variant char */
293 /* UDataSwapFn implementations used in udataswp.c ------- */
295 /* convert ASCII to EBCDIC and verify that all characters are invariant */
296 U_CAPI
int32_t U_EXPORT2
297 uprv_ebcdicFromAscii(const UDataSwapper
*ds
,
298 const void *inData
, int32_t length
, void *outData
,
299 UErrorCode
*pErrorCode
) {
306 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
309 if(ds
==NULL
|| inData
==NULL
|| length
<0 || (length
>0 && outData
==NULL
)) {
310 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
314 /* setup and swapping */
315 s
=(const uint8_t *)inData
;
316 t
=(uint8_t *)outData
;
320 if(!UCHAR_IS_INVARIANT(c
)) {
321 udata_printError(ds
, "uprv_ebcdicFromAscii() string[%d] contains a variant character in position %d\n",
322 length
, length
-count
);
323 *pErrorCode
=U_INVALID_CHAR_FOUND
;
326 *t
++=ebcdicFromAscii
[c
];
333 /* this function only checks and copies ASCII strings without conversion */
335 uprv_copyAscii(const UDataSwapper
*ds
,
336 const void *inData
, int32_t length
, void *outData
,
337 UErrorCode
*pErrorCode
) {
343 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
346 if(ds
==NULL
|| inData
==NULL
|| length
<0 || (length
>0 && outData
==NULL
)) {
347 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
351 /* setup and checking */
352 s
=(const uint8_t *)inData
;
356 if(!UCHAR_IS_INVARIANT(c
)) {
357 udata_printError(ds
, "uprv_copyFromAscii() string[%d] contains a variant character in position %d\n",
358 length
, length
-count
);
359 *pErrorCode
=U_INVALID_CHAR_FOUND
;
365 if(length
>0 && inData
!=outData
) {
366 uprv_memcpy(outData
, inData
, length
);
372 /* convert EBCDIC to ASCII and verify that all characters are invariant */
374 uprv_asciiFromEbcdic(const UDataSwapper
*ds
,
375 const void *inData
, int32_t length
, void *outData
,
376 UErrorCode
*pErrorCode
) {
383 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
386 if(ds
==NULL
|| inData
==NULL
|| length
<0 || (length
>0 && outData
==NULL
)) {
387 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
391 /* setup and swapping */
392 s
=(const uint8_t *)inData
;
393 t
=(uint8_t *)outData
;
397 if(c
!=0 && ((c
=asciiFromEbcdic
[c
])==0 || !UCHAR_IS_INVARIANT(c
))) {
398 udata_printError(ds
, "uprv_asciiFromEbcdic() string[%d] contains a variant character in position %d\n",
399 length
, length
-count
);
400 *pErrorCode
=U_INVALID_CHAR_FOUND
;
410 /* this function only checks and copies EBCDIC strings without conversion */
412 uprv_copyEbcdic(const UDataSwapper
*ds
,
413 const void *inData
, int32_t length
, void *outData
,
414 UErrorCode
*pErrorCode
) {
420 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
423 if(ds
==NULL
|| inData
==NULL
|| length
<0 || (length
>0 && outData
==NULL
)) {
424 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
428 /* setup and checking */
429 s
=(const uint8_t *)inData
;
433 if(c
!=0 && ((c
=asciiFromEbcdic
[c
])==0 || !UCHAR_IS_INVARIANT(c
))) {
434 udata_printError(ds
, "uprv_copyEbcdic() string[%] contains a variant character in position %d\n",
435 length
, length
-count
);
436 *pErrorCode
=U_INVALID_CHAR_FOUND
;
442 if(length
>0 && inData
!=outData
) {
443 uprv_memcpy(outData
, inData
, length
);
449 /* compare invariant strings; variant characters compare less than others and unlike each other */
451 uprv_compareInvAscii(const UDataSwapper
*ds
,
452 const char *outString
, int32_t outLength
,
453 const UChar
*localString
, int32_t localLength
) {
459 if(outString
==NULL
|| outLength
<-1 || localString
==NULL
|| localLength
<-1) {
464 outLength
=(int32_t)uprv_strlen(outString
);
467 localLength
=u_strlen(localString
);
470 minLength
= outLength
<localLength
? outLength
: localLength
;
473 c
=(uint8_t)*outString
++;
474 if(UCHAR_IS_INVARIANT(c
)) {
481 if(!UCHAR_IS_INVARIANT(c2
)) {
492 /* strings start with same prefix, compare lengths */
493 return outLength
-localLength
;
497 uprv_compareInvEbcdic(const UDataSwapper
*ds
,
498 const char *outString
, int32_t outLength
,
499 const UChar
*localString
, int32_t localLength
) {
505 if(outString
==NULL
|| outLength
<-1 || localString
==NULL
|| localLength
<-1) {
510 outLength
=(int32_t)uprv_strlen(outString
);
513 localLength
=u_strlen(localString
);
516 minLength
= outLength
<localLength
? outLength
: localLength
;
519 c
=(uint8_t)*outString
++;
522 } else if((c1
=asciiFromEbcdic
[c
])!=0 && UCHAR_IS_INVARIANT(c1
)) {
529 if(!UCHAR_IS_INVARIANT(c2
)) {
540 /* strings start with same prefix, compare lengths */
541 return outLength
-localLength
;
544 U_CAPI
int32_t U_EXPORT2
545 uprv_compareInvEbcdicAsAscii(const char *s1
, const char *s2
) {
552 if(c1
!=0 && ((c1
=asciiFromEbcdic
[c1
])==0 || !UCHAR_IS_INVARIANT(c1
))) {
553 c1
=-(int32_t)(uint8_t)*s1
;
555 if(c2
!=0 && ((c2
=asciiFromEbcdic
[c2
])==0 || !UCHAR_IS_INVARIANT(c2
))) {
556 c2
=-(int32_t)(uint8_t)*s2
;
/*
 * Returns the lowercaseAsciiFromEbcdic[] entry for the byte c.
 * The cast to uint8_t keeps the table index in 0..255 even where plain
 * char is a signed type.
 */
565 U_CAPI
char U_EXPORT2
566 uprv_ebcdicToLowercaseAscii(char c
) {
567 return (char)lowercaseAsciiFromEbcdic
[(uint8_t)c
];
570 U_INTERNAL
uint8_t* U_EXPORT2
571 uprv_aestrncpy(uint8_t *dst
, const uint8_t *src
, int32_t n
)
573 uint8_t *orig_dst
= dst
;
576 n
= uprv_strlen((const char*)src
)+1; /* copy NUL */
580 *(dst
++) = asciiFromEbcdic
[*(src
++)];
591 U_INTERNAL
uint8_t* U_EXPORT2
592 uprv_eastrncpy(uint8_t *dst
, const uint8_t *src
, int32_t n
)
594 uint8_t *orig_dst
= dst
;
597 n
= uprv_strlen((const char*)src
)+1; /* copy NUL */
601 char ch
= ebcdicFromAscii
[*(src
++)];
603 ch
= ebcdicFromAscii
[0x3f]; /* questionmark (subchar) */