]>
git.saurik.com Git - apple/icu.git/blob - icuSources/common/uinvchar.c
2 *******************************************************************************
4 * Copyright (C) 1999-2006, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * file name: uinvchar.c
10 * tab size: 8 (not used)
13 * created on: 2004sep14
14 * created by: Markus W. Scherer
16 * Functions for handling invariant characters, moved here from putil.c
17 * for better modularization.
20 #include "unicode/utypes.h"
21 #include "unicode/ustring.h"
28 /* invariant-character handling --------------------------------------------- */
31 * These maps for ASCII to/from EBCDIC map invariant characters (see utypes.h)
32 * appropriately for most EBCDIC codepages.
34 * They currently also map most other ASCII graphic characters,
35 * appropriately for codepages 37 and 1047.
36 * Exceptions: The characters for []^ have different codes in 37 & 1047.
37 * Both versions are mapped to ASCII.
44 * There are no mappings for variant characters from Unicode to EBCDIC.
46 * Currently, C0 control codes are also included in these maps.
47 * Exceptions: S/390 Open Edition swaps LF and NEL codes compared with other
48 * EBCDIC platforms; both codes (15 and 25) are mapped to ASCII LF (0A),
49 * but there is no mapping for ASCII LF back to EBCDIC.
51 * ASCII EBCDIC S/390-OE
55 * The maps below explicitly exclude the variant
56 * control and graphical characters that are in ASCII-based
57 * codepages at 0x80 and above.
58 * "No mapping" is expressed by mapping to a 00 byte.
60 * These tables do not establish a converter or a codepage.
/*
 * EBCDIC-to-ASCII byte map, indexed by the EBCDIC code (0x00..0xff).
 * An entry of 0x00 at any index other than 0 means "no mapping".
 * EBCDIC 0x15 and 0x25 both map to ASCII LF (0x0a), and the []^ codes of
 * both codepage 37 and codepage 1047 map to the ASCII characters.
 */
static const uint8_t asciiFromEbcdic[256]={
    0x00, 0x01, 0x02, 0x03, 0x00, 0x09, 0x00, 0x7f, 0x00, 0x00, 0x00, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    0x10, 0x11, 0x12, 0x13, 0x00, 0x0a, 0x08, 0x00, 0x18, 0x19, 0x00, 0x00, 0x1c, 0x1d, 0x1e, 0x1f,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x0a, 0x17, 0x1b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x06, 0x07,
    0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x14, 0x15, 0x00, 0x1a,

    0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2e, 0x3c, 0x28, 0x2b, 0x7c,
    0x26, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x21, 0x24, 0x2a, 0x29, 0x3b, 0x5e,
    0x2d, 0x2f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2c, 0x25, 0x5f, 0x3e, 0x3f,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x60, 0x3a, 0x23, 0x40, 0x27, 0x3d, 0x22,

    0x00, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x7e, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x00, 0x00, 0x00, 0x5b, 0x00, 0x00,
    0x5e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x5b, 0x5d, 0x00, 0x5d, 0x00, 0x00,

    0x7b, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x7d, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50, 0x51, 0x52, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x5c, 0x00, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
};
/*
 * ASCII-to-EBCDIC byte map, indexed by the ASCII/Unicode code (0x00..0xff).
 * An entry of 0x00 at any index other than 0 means "no mapping":
 * - ASCII LF (0x0a) has no mapping back to EBCDIC,
 * - variant graphic characters such as '!' (0x21) or '[' (0x5b) are unmapped,
 * - all codes at 0x80 and above are unmapped.
 */
static const uint8_t ebcdicFromAscii[256]={
    0x00, 0x01, 0x02, 0x03, 0x37, 0x2d, 0x2e, 0x2f, 0x16, 0x05, 0x00, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    0x10, 0x11, 0x12, 0x13, 0x3c, 0x3d, 0x32, 0x26, 0x18, 0x19, 0x3f, 0x27, 0x1c, 0x1d, 0x1e, 0x1f,
    0x40, 0x00, 0x7f, 0x00, 0x00, 0x6c, 0x50, 0x7d, 0x4d, 0x5d, 0x5c, 0x4e, 0x6b, 0x60, 0x4b, 0x61,
    0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0x7a, 0x5e, 0x4c, 0x7e, 0x6e, 0x6f,

    0x00, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6,
    0xd7, 0xd8, 0xd9, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0x00, 0x00, 0x00, 0x00, 0x6d,
    0x00, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96,
    0x97, 0x98, 0x99, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0x00, 0x00, 0x00, 0x00, 0x07,

    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
108 * Bit sets indicating which characters of the ASCII repertoire
109 * (by ASCII/Unicode code) are "invariant".
110 * See utypes.h for more details.
112 * As invariant are considered the characters of the ASCII repertoire except
114 * 21 '!' <exclamation mark>
115 * 23 '#' <number sign>
116 * 24 '$' <dollar sign>
118 * 40 '@' <commercial at>
120 * 5b '[' <left bracket>
122 * 5d ']' <right bracket>
123 * 5e '^' <circumflex>
125 * 60 '`' <grave accent>
127 * 7b '{' <left brace>
128 * 7c '|' <vertical line>
129 * 7d '}' <right brace>
/*
 * Bit set over the ASCII range 00..7f: bit (c&0x1f) of word (c>>5) is set
 * when the character with ASCII/Unicode code c is invariant.
 */
static const uint32_t invariantChars[4]={
    0xfffffbff, /* 00..1f but not 0a */
    0xffffffe5, /* 20..3f but not 21 23 24 */
    0x87fffffe, /* 40..5f but not 40 5b..5e */
    0x87fffffe  /* 60..7f but not 60 7b..7e */
};

/*
 * test unsigned types (or values known to be non-negative) for invariant characters,
 * tests ASCII-family character values
 *
 * The range check comes first so that invariantChars[] is never indexed
 * with a value above 0x7f. The argument is evaluated more than once:
 * do not pass expressions with side effects.
 */
#define UCHAR_IS_INVARIANT(c) (((c)<=0x7f) && (invariantChars[(c)>>5]&((uint32_t)1<<((c)&0x1f)))!=0)

/* test signed types for invariant characters, adds test for positive values */
#define SCHAR_IS_INVARIANT(c) ((0<=(c)) && UCHAR_IS_INVARIANT(c))
/*
 * Conversion between platform char codes and ASCII/Unicode codes.
 * On ASCII-family platforms the value passes through unchanged; on
 * EBCDIC-family platforms it is looked up in the byte maps, so the argument
 * must be a valid index into a 256-entry table (0..0xff).
 * Any other charset family is rejected at compile time.
 */
#if U_CHARSET_FAMILY==U_ASCII_FAMILY
#define CHAR_TO_UCHAR(c) (c)
#define UCHAR_TO_CHAR(c) (c)
#elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY
#define CHAR_TO_UCHAR(u) asciiFromEbcdic[u]
#define UCHAR_TO_CHAR(u) ebcdicFromAscii[u]
#else
#   error U_CHARSET_FAMILY is not valid
#endif
159 U_CAPI
void U_EXPORT2
160 u_charsToUChars(const char *cs
, UChar
*us
, int32_t length
) {
165 * Allow the entire ASCII repertoire to be mapped _to_ Unicode.
166 * For EBCDIC systems, this works for characters with codes from
167 * codepages 37 and 1047 or compatible.
171 u
=(UChar
)CHAR_TO_UCHAR(c
);
172 U_ASSERT((u
!=0 || c
==0)); /* only invariant chars converted? */
178 U_CAPI
void U_EXPORT2
179 u_UCharsToChars(const UChar
*us
, char *cs
, int32_t length
) {
184 if(!UCHAR_IS_INVARIANT(u
)) {
185 U_ASSERT(FALSE
); /* Variant characters were used. These are not portable in ICU. */
188 *cs
++=(char)UCHAR_TO_CHAR(u
);
193 U_CAPI UBool U_EXPORT2
194 uprv_isInvariantString(const char *s
, int32_t length
) {
212 continue; /* NUL is invariant */
215 /* c!=0 now, one branch below checks c==0 for variant characters */
218 * no assertions here because these functions are legitimately called
219 * for strings with variant characters
221 #if U_CHARSET_FAMILY==U_ASCII_FAMILY
222 if(!UCHAR_IS_INVARIANT(c
)) {
223 return FALSE
; /* found a variant char */
225 #elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY
227 if(c
==0 || !UCHAR_IS_INVARIANT(c
)) {
228 return FALSE
; /* found a variant char */
231 # error U_CHARSET_FAMILY is not valid
237 U_CAPI UBool U_EXPORT2
238 uprv_isInvariantUString(const UChar
*s
, int32_t length
) {
258 * no assertions here because these functions are legitimately called
259 * for strings with variant characters
261 if(!UCHAR_IS_INVARIANT(c
)) {
262 return FALSE
; /* found a variant char */
268 /* UDataSwapFn implementations used in udataswp.c ------- */
270 /* convert ASCII to EBCDIC and verify that all characters are invariant */
271 U_CAPI
int32_t U_EXPORT2
272 uprv_ebcdicFromAscii(const UDataSwapper
*ds
,
273 const void *inData
, int32_t length
, void *outData
,
274 UErrorCode
*pErrorCode
) {
281 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
284 if(ds
==NULL
|| inData
==NULL
|| length
<0 || (length
>0 && outData
==NULL
)) {
285 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
289 /* setup and swapping */
290 s
=(const uint8_t *)inData
;
291 t
=(uint8_t *)outData
;
295 if(!UCHAR_IS_INVARIANT(c
)) {
296 udata_printError(ds
, "uprv_ebcdicFromAscii() string[%d] contains a variant character in position %d\n",
297 length
, length
-count
);
298 *pErrorCode
=U_INVALID_CHAR_FOUND
;
301 *t
++=ebcdicFromAscii
[c
];
308 /* this function only checks and copies ASCII strings without conversion */
310 uprv_copyAscii(const UDataSwapper
*ds
,
311 const void *inData
, int32_t length
, void *outData
,
312 UErrorCode
*pErrorCode
) {
318 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
321 if(ds
==NULL
|| inData
==NULL
|| length
<0 || (length
>0 && outData
==NULL
)) {
322 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
326 /* setup and checking */
327 s
=(const uint8_t *)inData
;
331 if(!UCHAR_IS_INVARIANT(c
)) {
332 udata_printError(ds
, "uprv_copyFromAscii() string[%d] contains a variant character in position %d\n",
333 length
, length
-count
);
334 *pErrorCode
=U_INVALID_CHAR_FOUND
;
340 if(length
>0 && inData
!=outData
) {
341 uprv_memcpy(outData
, inData
, length
);
347 /* convert EBCDIC to ASCII and verify that all characters are invariant */
349 uprv_asciiFromEbcdic(const UDataSwapper
*ds
,
350 const void *inData
, int32_t length
, void *outData
,
351 UErrorCode
*pErrorCode
) {
358 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
361 if(ds
==NULL
|| inData
==NULL
|| length
<0 || (length
>0 && outData
==NULL
)) {
362 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
366 /* setup and swapping */
367 s
=(const uint8_t *)inData
;
368 t
=(uint8_t *)outData
;
372 if(c
!=0 && ((c
=asciiFromEbcdic
[c
])==0 || !UCHAR_IS_INVARIANT(c
))) {
373 udata_printError(ds
, "uprv_asciiFromEbcdic() string[%d] contains a variant character in position %d\n",
374 length
, length
-count
);
375 *pErrorCode
=U_INVALID_CHAR_FOUND
;
385 /* this function only checks and copies EBCDIC strings without conversion */
387 uprv_copyEbcdic(const UDataSwapper
*ds
,
388 const void *inData
, int32_t length
, void *outData
,
389 UErrorCode
*pErrorCode
) {
395 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
398 if(ds
==NULL
|| inData
==NULL
|| length
<0 || (length
>0 && outData
==NULL
)) {
399 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
403 /* setup and checking */
404 s
=(const uint8_t *)inData
;
408 if(c
!=0 && ((c
=asciiFromEbcdic
[c
])==0 || !UCHAR_IS_INVARIANT(c
))) {
409 udata_printError(ds
, "uprv_copyEbcdic() string[%] contains a variant character in position %d\n",
410 length
, length
-count
);
411 *pErrorCode
=U_INVALID_CHAR_FOUND
;
417 if(length
>0 && inData
!=outData
) {
418 uprv_memcpy(outData
, inData
, length
);
424 /* compare invariant strings; variant characters compare less than others and unlike each other */
426 uprv_compareInvAscii(const UDataSwapper
*ds
,
427 const char *outString
, int32_t outLength
,
428 const UChar
*localString
, int32_t localLength
) {
433 if(outString
==NULL
|| outLength
<-1 || localString
==NULL
|| localLength
<-1) {
438 outLength
=(int32_t)uprv_strlen(outString
);
441 localLength
=u_strlen(localString
);
444 minLength
= outLength
<localLength
? outLength
: localLength
;
447 c
=(uint8_t)*outString
++;
448 if(UCHAR_IS_INVARIANT(c
)) {
455 if(!UCHAR_IS_INVARIANT(c2
)) {
466 /* strings start with same prefix, compare lengths */
467 return outLength
-localLength
;
471 uprv_compareInvEbcdic(const UDataSwapper
*ds
,
472 const char *outString
, int32_t outLength
,
473 const UChar
*localString
, int32_t localLength
) {
478 if(outString
==NULL
|| outLength
<-1 || localString
==NULL
|| localLength
<-1) {
483 outLength
=(int32_t)uprv_strlen(outString
);
486 localLength
=u_strlen(localString
);
489 minLength
= outLength
<localLength
? outLength
: localLength
;
492 c
=(uint8_t)*outString
++;
495 } else if((c1
=asciiFromEbcdic
[c
])!=0 && UCHAR_IS_INVARIANT(c1
)) {
502 if(!UCHAR_IS_INVARIANT(c2
)) {
513 /* strings start with same prefix, compare lengths */
514 return outLength
-localLength
;