1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 *****************************************************************************
6 * Copyright (C) 1998-2016, International Business Machines
7 * Corporation and others. All Rights Reserved.
9 *****************************************************************************
12 * Implements error behaviour functions called by T_UConverter_{from,to}Unicode
17 * 06/29/2000 helena Major rewrite of the callback APIs.
20 #include "unicode/utypes.h"
22 #if !UCONFIG_NO_CONVERSION
24 #include "unicode/ucnv_err.h"
25 #include "unicode/ucnv_cb.h"
28 #include "unicode/ucnv.h"
// Capacity, in UChars, of the local buffers in which the escape callbacks
// below build their output (valueString and uniValueString).
31 #define VALUE_STRING_LENGTH 48
32 /* NOTE: this comment predates the current value: it derives 32 = 4(number of char in value string) * 8(max number of bytes per char for any converter), but VALUE_STRING_LENGTH is now 48. */
// Unicode code points of the literal characters that the escape callbacks
// append to their output buffers.
33 #define UNICODE_PERCENT_SIGN_CODEPOINT 0x0025
34 #define UNICODE_U_CODEPOINT 0x0055
35 #define UNICODE_X_CODEPOINT 0x0058
36 #define UNICODE_RS_CODEPOINT 0x005C
37 #define UNICODE_U_LOW_CODEPOINT 0x0075
38 #define UNICODE_X_LOW_CODEPOINT 0x0078
39 #define UNICODE_AMP_CODEPOINT 0x0026
40 #define UNICODE_HASH_CODEPOINT 0x0023
41 #define UNICODE_SEMICOLON_CODEPOINT 0x003B
42 #define UNICODE_PLUS_CODEPOINT 0x002B
43 #define UNICODE_LEFT_CURLY_CODEPOINT 0x007B
44 #define UNICODE_RIGHT_CURLY_CODEPOINT 0x007D
45 #define UNICODE_SPACE_CODEPOINT 0x0020
// Escape-style selectors: the escape callbacks switch on the first char of
// their context argument and compare it against these values.
// NOTE(review): UCNV_PRV_ESCAPE_ICU does not appear in any case label visible
// in this listing; presumably it names the fallback format - confirm.
46 #define UCNV_PRV_ESCAPE_ICU 0
47 #define UCNV_PRV_ESCAPE_C 'C'
48 #define UCNV_PRV_ESCAPE_XML_DEC 'D'
49 #define UCNV_PRV_ESCAPE_XML_HEX 'X'
50 #define UCNV_PRV_ESCAPE_JAVA 'J'
51 #define UCNV_PRV_ESCAPE_UNICODE 'U'
52 #define UCNV_PRV_ESCAPE_CSS2 'S'
// Option char that the skip and substitute callbacks test on the first char of
// context, together with reason == UCNV_UNASSIGNED.
53 #define UCNV_PRV_STOP_ON_ILLEGAL 'i'
56 * IS_DEFAULT_IGNORABLE_CODE_POINT
57 * This is to check if a code point has the default ignorable unicode property.
58 * As such, this list needs to be updated if the ignorable code point list ever
60 * To avoid dependency on other code, this list is hard coded here.
61 * When an ignorable code point is found and is unmappable, the default callbacks
63 * For a list of the default ignorable code points, use this link: http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[%3ADI%3A]&g=
65 * This list should be kept in sync with the one in CharsetCallback.java
// Evaluates to true when c lies inside one of the hard-coded code point ranges
// listed below.
// NOTE(review): c is expanded without parentheses and more than once, so
// callers should pass a simple expression without side effects.
// NOTE(review): only part of the range list is visible in this listing, and
// the visible ranges are not in ascending order.
67 #define IS_DEFAULT_IGNORABLE_CODE_POINT(c) (\
73 (0x17B4 <= c && c <= 0x17B5) || \
74 (0x180B <= c && c <= 0x180E) || \
75 (0x200B <= c && c <= 0x200F) || \
76 (0x202A <= c && c <= 0x202E) || \
78 (0x2066 <= c && c <= 0x2069) || \
79 (0x2061 <= c && c <= 0x2064) || \
80 (0x206A <= c && c <= 0x206F) || \
82 (0x0FE00 <= c && c <= 0x0FE0F) || \
85 (0x01BCA0 <= c && c <= 0x01BCA3) || \
86 (0x01D173 <= c && c <= 0x01D17A) || \
88 (0x0E0020 <= c && c <= 0x0E007F) || \
89 (0x0E0100 <= c && c <= 0x0E01EF) || \
91 (0x0FFF0 <= c && c <= 0x0FFF8) || \
93 (0x0E0002 <= c && c <= 0x0E001F) || \
94 (0x0E0080 <= c && c <= 0x0E00FF) || \
95 (0x0E01F0 <= c && c <= 0x0E0FFF) \
99 /*Function Pointer STOPS at the ILLEGAL_SEQUENCE */
// UCNV_FROM_U_CALLBACK_STOP: from-Unicode error callback.
// NOTE(review): this listing omits part of the definition (the remaining
// parameters, the braces and the statements guarded by the condition below),
// so only the visible pieces are described.
100 U_CAPI
void U_EXPORT2
101 UCNV_FROM_U_CALLBACK_STOP (
103 UConverterFromUnicodeArgs
*fromUArgs
,
104 const UChar
* codeUnits
,
107 UConverterCallbackReason reason
,
// Special case: reason is UCNV_UNASSIGNED and codePoint matches
// IS_DEFAULT_IGNORABLE_CODE_POINT. Per the existing comment such code points
// are skipped; the statement that does so is not visible here.
114 if (reason
== UCNV_UNASSIGNED
&& IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint
))
117 * Skip if the codepoint has unicode property of default ignorable.
121 /* the caller must have set the error code accordingly */
126 /*Function Pointer STOPS at the ILLEGAL_SEQUENCE */
// UCNV_TO_U_CALLBACK_STOP: to-Unicode error callback.
// The only statements visible in the body are (void) casts of the parameters,
// so the callback leaves whatever error code the caller set untouched.
// NOTE(review): the remaining parameter declarations and the braces are not
// visible in this listing.
127 U_CAPI
void U_EXPORT2
128 UCNV_TO_U_CALLBACK_STOP (
130 UConverterToUnicodeArgs
*toUArgs
,
131 const char* codePoints
,
133 UConverterCallbackReason reason
,
136 /* the caller must have set the error code accordingly */
// Each parameter is cast to void to mark it as intentionally unused.
137 (void)context
; (void)toUArgs
; (void)codePoints
; (void)length
; (void)reason
; (void)err
;
// UCNV_FROM_U_CALLBACK_SKIP: from-Unicode error callback.
// NOTE(review): this listing omits the remaining parameters, the braces and
// the statements executed inside each branch; only the conditions are visible.
141 U_CAPI
void U_EXPORT2
142 UCNV_FROM_U_CALLBACK_SKIP (
144 UConverterFromUnicodeArgs
*fromUArgs
,
145 const UChar
* codeUnits
,
148 UConverterCallbackReason reason
,
// Only reasons up to and including UCNV_IRREGULAR are acted upon; per the
// closing comment, reset, close and clone calls are ignored.
154 if (reason
<= UCNV_IRREGULAR
)
// First case: an unassigned code point that matches
// IS_DEFAULT_IGNORABLE_CODE_POINT.
156 if (reason
== UCNV_UNASSIGNED
&& IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint
))
159 * Skip if the codepoint has unicode property of default ignorable.
// Second case: context is NULL, or its first char is UCNV_PRV_STOP_ON_ILLEGAL
// and the reason is UCNV_UNASSIGNED.
// NOTE(review): the statement executed for this case is not visible here.
163 else if (context
== NULL
|| (*((char*)context
) == UCNV_PRV_STOP_ON_ILLEGAL
&& reason
== UCNV_UNASSIGNED
))
167 /* else the caller must have set the error code accordingly. */
169 /* else ignore the reset, close and clone calls. */
// UCNV_FROM_U_CALLBACK_SUBSTITUTE: from-Unicode error callback that calls
// ucnv_cbFromUWriteSub for the cases selected below.
// NOTE(review): this listing omits the remaining parameters, the braces and
// some statements; only the visible pieces are described.
172 U_CAPI
void U_EXPORT2
173 UCNV_FROM_U_CALLBACK_SUBSTITUTE (
175 UConverterFromUnicodeArgs
*fromArgs
,
176 const UChar
* codeUnits
,
179 UConverterCallbackReason reason
,
// Only reasons up to and including UCNV_IRREGULAR are acted upon; per the
// closing comment, reset, close and clone calls are ignored.
184 if (reason
<= UCNV_IRREGULAR
)
// First case: an unassigned code point that matches
// IS_DEFAULT_IGNORABLE_CODE_POINT.
186 if (reason
== UCNV_UNASSIGNED
&& IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint
))
189 * Skip if the codepoint has unicode property of default ignorable.
// Second case: context is NULL, or its first char is UCNV_PRV_STOP_ON_ILLEGAL
// and the reason is UCNV_UNASSIGNED.
193 else if (context
== NULL
|| (*((char*)context
) == UCNV_PRV_STOP_ON_ILLEGAL
&& reason
== UCNV_UNASSIGNED
))
// Writes the substitution through ucnv_cbFromUWriteSub with an offset
// argument of 0; err is passed through to that call.
196 ucnv_cbFromUWriteSub(fromArgs
, 0, err
);
198 /* else the caller must have set the error code accordingly. */
200 /* else ignore the reset, close and clone calls. */
203 /*uses uprv_itou to get a unicode escape sequence of the offensive sequence,
204 *uses a clean copy (reset) of the converter, to convert that unicode
205 *escape sequence to the target codepage (if conversion failure happens then
206 *we revert to substituting with subchar)
// UCNV_FROM_U_CALLBACK_ESCAPE: from-Unicode error callback that builds an
// escape sequence in a local UChar buffer and writes it with
// ucnv_cbFromUWriteUChars.
// NOTE(review): this listing omits the remaining parameters, the braces, the
// loop headers, the break statements and the conditions that choose between
// the codePoint and codeUnits[0] variants; only visible pieces are described.
// NOTE(review): the comments on uprv_itou below assume its last two arguments
// are the radix and the minimum number of digits - confirm.
208 U_CAPI
void U_EXPORT2
209 UCNV_FROM_U_CALLBACK_ESCAPE (
211 UConverterFromUnicodeArgs
*fromArgs
,
212 const UChar
*codeUnits
,
215 UConverterCallbackReason reason
,
// Scratch buffer for the escape text and the count of UChars written so far.
219 UChar valueString
[VALUE_STRING_LENGTH
];
220 int32_t valueStringLength
= 0;
// Cursor handed by address to ucnv_cbFromUWriteUChars further down.
223 const UChar
*myValueSource
= NULL
;
// Separate error code that is tested with U_FAILURE after each
// ucnv_setFromUCallBack call (presumably passed to those calls - confirm).
224 UErrorCode err2
= U_ZERO_ERROR
;
// Slots for callback and context pairs; presumably filled by the
// ucnv_setFromUCallBack calls below - confirm.
225 UConverterFromUCallback original
= NULL
;
226 const void *originalContext
;
228 UConverterFromUCallback ignoredCallback
= NULL
;
229 const void *ignoredContext
;
// Reasons above UCNV_IRREGULAR are handled first.
// NOTE(review): the action taken for them is not visible here.
231 if (reason
> UCNV_IRREGULAR
)
// Unassigned code points that match IS_DEFAULT_IGNORABLE_CODE_POINT are
// special-cased next.
235 else if (reason
== UCNV_UNASSIGNED
&& IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint
))
238 * Skip if the codepoint has unicode property of default ignorable.
// Installs UCNV_FROM_U_CALLBACK_SUBSTITUTE as the from-Unicode callback on
// fromArgs->converter. A second ucnv_setFromUCallBack call follows the write
// near the end of the function.
244 ucnv_setFromUCallBack (fromArgs
->converter
,
245 (UConverterFromUCallback
) UCNV_FROM_U_CALLBACK_SUBSTITUTE
,
251 if (U_FAILURE (err2
))
// First format: percent sign, capital U, then one code unit in hex with at
// least four digits. The i++ index suggests a loop over the code units.
// NOTE(review): the condition selecting this branch is not visible; it is
// presumably the context == NULL case - confirm.
260 valueString
[valueStringLength
++] = (UChar
) UNICODE_PERCENT_SIGN_CODEPOINT
; /* adding % */
261 valueString
[valueStringLength
++] = (UChar
) UNICODE_U_CODEPOINT
; /* adding U */
262 valueStringLength
+= uprv_itou (valueString
+ valueStringLength
, VALUE_STRING_LENGTH
- valueStringLength
, (uint16_t)codeUnits
[i
++], 16, 4);
// Otherwise the first char of context selects the escape style.
267 switch(*((char*)context
))
// Java style: backslash, lowercase u, then one code unit in hex (four digits).
269 case UCNV_PRV_ESCAPE_JAVA
:
272 valueString
[valueStringLength
++] = (UChar
) UNICODE_RS_CODEPOINT
; /* adding \ */
273 valueString
[valueStringLength
++] = (UChar
) UNICODE_U_LOW_CODEPOINT
; /* adding u */
274 valueStringLength
+= uprv_itou (valueString
+ valueStringLength
, VALUE_STRING_LENGTH
- valueStringLength
, (uint16_t)codeUnits
[i
++], 16, 4);
// C style: backslash, then either capital U with codePoint in eight hex
// digits or lowercase u with codeUnits[0] in four hex digits.
278 case UCNV_PRV_ESCAPE_C
:
279 valueString
[valueStringLength
++] = (UChar
) UNICODE_RS_CODEPOINT
; /* adding \ */
282 valueString
[valueStringLength
++] = (UChar
) UNICODE_U_CODEPOINT
; /* adding U */
283 valueStringLength
+= uprv_itou (valueString
+ valueStringLength
, VALUE_STRING_LENGTH
- valueStringLength
, codePoint
, 16, 8);
287 valueString
[valueStringLength
++] = (UChar
) UNICODE_U_LOW_CODEPOINT
; /* adding u */
288 valueStringLength
+= uprv_itou (valueString
+ valueStringLength
, VALUE_STRING_LENGTH
- valueStringLength
, (uint16_t)codeUnits
[0], 16, 4);
// XML decimal style: ampersand, hash, the value in decimal (codePoint or
// codeUnits[0]), then a semicolon.
292 case UCNV_PRV_ESCAPE_XML_DEC
:
294 valueString
[valueStringLength
++] = (UChar
) UNICODE_AMP_CODEPOINT
; /* adding & */
295 valueString
[valueStringLength
++] = (UChar
) UNICODE_HASH_CODEPOINT
; /* adding # */
297 valueStringLength
+= uprv_itou (valueString
+ valueStringLength
, VALUE_STRING_LENGTH
- valueStringLength
, codePoint
, 10, 0);
300 valueStringLength
+= uprv_itou (valueString
+ valueStringLength
, VALUE_STRING_LENGTH
- valueStringLength
, (uint16_t)codeUnits
[0], 10, 0);
302 valueString
[valueStringLength
++] = (UChar
) UNICODE_SEMICOLON_CODEPOINT
; /* adding ; */
// XML hex style: ampersand, hash, lowercase x, the value in hex (codePoint or
// codeUnits[0]), then a semicolon.
305 case UCNV_PRV_ESCAPE_XML_HEX
:
307 valueString
[valueStringLength
++] = (UChar
) UNICODE_AMP_CODEPOINT
; /* adding & */
308 valueString
[valueStringLength
++] = (UChar
) UNICODE_HASH_CODEPOINT
; /* adding # */
309 valueString
[valueStringLength
++] = (UChar
) UNICODE_X_LOW_CODEPOINT
; /* adding x */
311 valueStringLength
+= uprv_itou (valueString
+ valueStringLength
, VALUE_STRING_LENGTH
- valueStringLength
, codePoint
, 16, 0);
314 valueStringLength
+= uprv_itou (valueString
+ valueStringLength
, VALUE_STRING_LENGTH
- valueStringLength
, (uint16_t)codeUnits
[0], 16, 0);
316 valueString
[valueStringLength
++] = (UChar
) UNICODE_SEMICOLON_CODEPOINT
; /* adding ; */
// Unicode style: left curly brace, capital U, plus sign, the value in hex with
// at least four digits (codePoint or codeUnits[0]), then a right curly brace.
319 case UCNV_PRV_ESCAPE_UNICODE
:
320 valueString
[valueStringLength
++] = (UChar
) UNICODE_LEFT_CURLY_CODEPOINT
; /* adding { */
321 valueString
[valueStringLength
++] = (UChar
) UNICODE_U_CODEPOINT
; /* adding U */
322 valueString
[valueStringLength
++] = (UChar
) UNICODE_PLUS_CODEPOINT
; /* adding + */
324 valueStringLength
+= uprv_itou (valueString
+ valueStringLength
, VALUE_STRING_LENGTH
- valueStringLength
, codePoint
, 16, 4);
326 valueStringLength
+= uprv_itou (valueString
+ valueStringLength
, VALUE_STRING_LENGTH
- valueStringLength
, (uint16_t)codeUnits
[0], 16, 4);
328 valueString
[valueStringLength
++] = (UChar
) UNICODE_RIGHT_CURLY_CODEPOINT
; /* adding } */
// CSS2 style: backslash, codePoint in hex, then a trailing space.
331 case UCNV_PRV_ESCAPE_CSS2
:
332 valueString
[valueStringLength
++] = (UChar
) UNICODE_RS_CODEPOINT
; /* adding \ */
333 valueStringLength
+= uprv_itou (valueString
+ valueStringLength
, VALUE_STRING_LENGTH
- valueStringLength
, codePoint
, 16, 0);
334 /* Always add space character, because the next character might be whitespace,
335 which would erroneously be considered the termination of the escape sequence. */
336 valueString
[valueStringLength
++] = (UChar
) UNICODE_SPACE_CODEPOINT
;
// Remaining case (presumably the default label, which is not visible): the
// same percent sign, capital U, four hex digit format as the first branch.
342 valueString
[valueStringLength
++] = (UChar
) UNICODE_PERCENT_SIGN_CODEPOINT
; /* adding % */
343 valueString
[valueStringLength
++] = (UChar
) UNICODE_U_CODEPOINT
; /* adding U */
344 valueStringLength
+= uprv_itou (valueString
+ valueStringLength
, VALUE_STRING_LENGTH
- valueStringLength
, (uint16_t)codeUnits
[i
++], 16, 4);
// Point the cursor at the start of the finished escape text.
348 myValueSource
= valueString
;
350 /* reset the error */
// Writes the valueStringLength UChars of escape text through
// ucnv_cbFromUWriteUChars with an offset argument of 0.
353 ucnv_cbFromUWriteUChars(fromArgs
, &myValueSource
, myValueSource
+valueStringLength
, 0, err
);
// Second callback change on the converter.
// NOTE(review): its remaining arguments are not visible; presumably it
// restores the callback saved in original and originalContext - confirm.
355 ucnv_setFromUCallBack (fromArgs
->converter
,
361 if (U_FAILURE (err2
))
// UCNV_TO_U_CALLBACK_SKIP: to-Unicode error callback.
// NOTE(review): this listing omits the remaining parameters, the braces and
// the statement executed inside the inner branch; only the conditions are
// visible.
372 U_CAPI
void U_EXPORT2
373 UCNV_TO_U_CALLBACK_SKIP (
375 UConverterToUnicodeArgs
*toArgs
,
376 const char* codeUnits
,
378 UConverterCallbackReason reason
,
// Only reasons up to and including UCNV_IRREGULAR are acted upon; per the
// closing comment, reset, close and clone calls are ignored.
384 if (reason
<= UCNV_IRREGULAR
)
// Acts when context is NULL, or when its first char is
// UCNV_PRV_STOP_ON_ILLEGAL and the reason is UCNV_UNASSIGNED.
386 if (context
== NULL
|| (*((char*)context
) == UCNV_PRV_STOP_ON_ILLEGAL
&& reason
== UCNV_UNASSIGNED
))
390 /* else the caller must have set the error code accordingly. */
392 /* else ignore the reset, close and clone calls. */
// UCNV_TO_U_CALLBACK_SUBSTITUTE: to-Unicode error callback that calls
// ucnv_cbToUWriteSub for the cases selected below.
// NOTE(review): this listing omits the remaining parameters, the braces and
// some statements; only the visible pieces are described.
395 U_CAPI
void U_EXPORT2
396 UCNV_TO_U_CALLBACK_SUBSTITUTE (
398 UConverterToUnicodeArgs
*toArgs
,
399 const char* codeUnits
,
401 UConverterCallbackReason reason
,
// Only reasons up to and including UCNV_IRREGULAR are acted upon; per the
// closing comment, reset, close and clone calls are ignored.
406 if (reason
<= UCNV_IRREGULAR
)
// Acts when context is NULL, or when its first char is
// UCNV_PRV_STOP_ON_ILLEGAL and the reason is UCNV_UNASSIGNED.
408 if (context
== NULL
|| (*((char*)context
) == UCNV_PRV_STOP_ON_ILLEGAL
&& reason
== UCNV_UNASSIGNED
))
// Writes the substitution through ucnv_cbToUWriteSub with an offset argument
// of 0; err is passed through to that call.
411 ucnv_cbToUWriteSub(toArgs
,0,err
);
413 /* else the caller must have set the error code accordingly. */
415 /* else ignore the reset, close and clone calls. */
418 /*uses uprv_itou to get a unicode escape sequence of the offensive sequence,
419 *and uses that as the substitution sequence
// UCNV_TO_U_CALLBACK_ESCAPE: to-Unicode error callback that builds an escape
// sequence for the offending bytes in a local UChar buffer and writes it with
// ucnv_cbToUWriteUChars.
// NOTE(review): this listing omits the remaining parameters, the braces, the
// loop headers and the break statements; only visible pieces are described.
// NOTE(review): the comments on uprv_itou below assume its last two arguments
// are the radix and the minimum number of digits - confirm.
421 U_CAPI
void U_EXPORT2
422 UCNV_TO_U_CALLBACK_ESCAPE (
424 UConverterToUnicodeArgs
*toArgs
,
425 const char* codeUnits
,
427 UConverterCallbackReason reason
,
// Scratch buffer for the escape text and the count of UChars written so far.
430 UChar uniValueString
[VALUE_STRING_LENGTH
];
431 int32_t valueStringLength
= 0;
// Reasons above UCNV_IRREGULAR are handled first.
// NOTE(review): the action taken for them is not visible here.
434 if (reason
> UCNV_IRREGULAR
)
// First format: percent sign, capital X, then one byte in hex (two digits).
// The i++ index suggests a loop over the bytes.
// NOTE(review): the condition selecting this branch is not visible; it is
// presumably the context == NULL case - confirm.
443 uniValueString
[valueStringLength
++] = (UChar
) UNICODE_PERCENT_SIGN_CODEPOINT
; /* adding % */
444 uniValueString
[valueStringLength
++] = (UChar
) UNICODE_X_CODEPOINT
; /* adding X */
445 valueStringLength
+= uprv_itou (uniValueString
+ valueStringLength
, VALUE_STRING_LENGTH
- valueStringLength
, (uint8_t) codeUnits
[i
++], 16, 2);
// Otherwise the first char of context selects the escape style.
450 switch(*((char*)context
))
// XML decimal style: ampersand, hash, one byte in decimal, then a semicolon.
452 case UCNV_PRV_ESCAPE_XML_DEC
:
455 uniValueString
[valueStringLength
++] = (UChar
) UNICODE_AMP_CODEPOINT
; /* adding & */
456 uniValueString
[valueStringLength
++] = (UChar
) UNICODE_HASH_CODEPOINT
; /* adding # */
457 valueStringLength
+= uprv_itou (uniValueString
+ valueStringLength
, VALUE_STRING_LENGTH
- valueStringLength
, (uint8_t)codeUnits
[i
++], 10, 0);
458 uniValueString
[valueStringLength
++] = (UChar
) UNICODE_SEMICOLON_CODEPOINT
; /* adding ; */
// XML hex style: ampersand, hash, lowercase x, one byte in hex, then a
// semicolon.
462 case UCNV_PRV_ESCAPE_XML_HEX
:
465 uniValueString
[valueStringLength
++] = (UChar
) UNICODE_AMP_CODEPOINT
; /* adding & */
466 uniValueString
[valueStringLength
++] = (UChar
) UNICODE_HASH_CODEPOINT
; /* adding # */
467 uniValueString
[valueStringLength
++] = (UChar
) UNICODE_X_LOW_CODEPOINT
; /* adding x */
468 valueStringLength
+= uprv_itou (uniValueString
+ valueStringLength
, VALUE_STRING_LENGTH
- valueStringLength
, (uint8_t)codeUnits
[i
++], 16, 0);
469 uniValueString
[valueStringLength
++] = (UChar
) UNICODE_SEMICOLON_CODEPOINT
; /* adding ; */
// C style: backslash, lowercase x, then one byte in hex (two digits).
472 case UCNV_PRV_ESCAPE_C
:
475 uniValueString
[valueStringLength
++] = (UChar
) UNICODE_RS_CODEPOINT
; /* adding \ */
476 uniValueString
[valueStringLength
++] = (UChar
) UNICODE_X_LOW_CODEPOINT
; /* adding x */
477 valueStringLength
+= uprv_itou (uniValueString
+ valueStringLength
, VALUE_STRING_LENGTH
- valueStringLength
, (uint8_t)codeUnits
[i
++], 16, 2);
// Remaining case (presumably the default label, which is not visible): percent
// sign, capital X, then one byte in hex.
// NOTE(review): unlike every other branch, the uprv_itou return value is
// discarded here and the length is advanced by a fixed 2. That matches the
// other branches only if uprv_itou always writes exactly two digits for a
// uint8_t with these arguments - confirm.
483 uniValueString
[valueStringLength
++] = (UChar
) UNICODE_PERCENT_SIGN_CODEPOINT
; /* adding % */
484 uniValueString
[valueStringLength
++] = (UChar
) UNICODE_X_CODEPOINT
; /* adding X */
485 uprv_itou (uniValueString
+ valueStringLength
, VALUE_STRING_LENGTH
- valueStringLength
, (uint8_t) codeUnits
[i
++], 16, 2);
486 valueStringLength
+= 2;
490 /* reset the error */
// Writes the valueStringLength UChars of escape text through
// ucnv_cbToUWriteUChars with an offset argument of 0.
493 ucnv_cbToUWriteUChars(toArgs
, uniValueString
, valueStringLength
, 0, err
);