1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 *****************************************************************************
6 * Copyright (C) 1998-2016, International Business Machines
7 * Corporation and others. All Rights Reserved.
9 *****************************************************************************
12 * Implements error behaviour functions called by T_UConverter_{from,to}Unicode
17 * 06/29/2000 helena Major rewrite of the callback APIs.
20 #include "unicode/utypes.h"
22 #if !UCONFIG_NO_CONVERSION
24 #include "unicode/ucnv_err.h"
25 #include "unicode/ucnv_cb.h"
28 #include "unicode/ucnv.h"
/* Capacity, in UChars, of the scratch buffers (valueString / uniValueString)
 * in which the escape callbacks assemble an escape sequence. */
31 #define VALUE_STRING_LENGTH 48
32 /*Magic # 32 = 4(number of char in value string) * 8(max number of bytes per char for any converter)
 * NOTE(review): this derivation yields 32, but VALUE_STRING_LENGTH is 48 -
 * confirm which value is intended and update the note accordingly. */
/* Code points of the ASCII characters that the escape callbacks append to the
 * scratch buffer when building an escape sequence. */
33 #define UNICODE_PERCENT_SIGN_CODEPOINT 0x0025 /* '%' */
34 #define UNICODE_U_CODEPOINT 0x0055 /* 'U' */
35 #define UNICODE_X_CODEPOINT 0x0058 /* 'X' */
36 #define UNICODE_RS_CODEPOINT 0x005C /* backslash (reverse solidus) */
37 #define UNICODE_U_LOW_CODEPOINT 0x0075 /* 'u' */
38 #define UNICODE_X_LOW_CODEPOINT 0x0078 /* 'x' */
39 #define UNICODE_AMP_CODEPOINT 0x0026 /* '&' */
40 #define UNICODE_HASH_CODEPOINT 0x0023 /* '#' */
41 #define UNICODE_SEMICOLON_CODEPOINT 0x003B /* ';' */
42 #define UNICODE_PLUS_CODEPOINT 0x002B /* '+' */
43 #define UNICODE_LEFT_CURLY_CODEPOINT 0x007B /* '{' */
44 #define UNICODE_RIGHT_CURLY_CODEPOINT 0x007D /* '}' */
45 #define UNICODE_SPACE_CODEPOINT 0x0020 /* ' ' */
/* Escape-style selectors. The escape callbacks switch on the first char of the
 * callback context and compare it against these values.
 * NOTE(review): UCNV_PRV_ESCAPE_ICU is not referenced by any visible case
 * label; presumably it denotes the default %U / %X form - confirm. */
46 #define UCNV_PRV_ESCAPE_ICU 0
47 #define UCNV_PRV_ESCAPE_C 'C'
48 #define UCNV_PRV_ESCAPE_XML_DEC 'D'
49 #define UCNV_PRV_ESCAPE_XML_HEX 'X'
50 #define UCNV_PRV_ESCAPE_JAVA 'J'
51 #define UCNV_PRV_ESCAPE_UNICODE 'U'
52 #define UCNV_PRV_ESCAPE_CSS2 'S'
/* Context option checked by the skip and substitute callbacks: they compare the
 * first char of a non-NULL context against this value, together with
 * reason == UCNV_UNASSIGNED. */
53 #define UCNV_PRV_STOP_ON_ILLEGAL 'i'
56 * IS_DEFAULT_IGNORABLE_CODE_POINT
57 * This is to check if a code point has the default ignorable unicode property.
58 * As such, this list needs to be updated if the ignorable code point list ever
60 * To avoid dependency on other code, this list is hard coded here.
61 * When an ignorable code point is found and is unmappable, the default callbacks
63 * For a list of the default ignorable code points, use this link:
64 * https://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5B%3ADI%3A%5D&abb=on&g=&i=
66 * This list should be kept in sync with the one in CharsetCallback.java
68 #define IS_DEFAULT_IGNORABLE_CODE_POINT(c) ( \
74 (0x17B4 <= c && c <= 0x17B5) || \
75 (0x180B <= c && c <= 0x180E) || \
76 (0x200B <= c && c <= 0x200F) || \
77 (0x202A <= c && c <= 0x202E) || \
78 (0x2060 <= c && c <= 0x206F) || \
80 (0xFE00 <= c && c <= 0xFE0F) || \
83 (0xFFF0 <= c && c <= 0xFFF8) || \
84 (0x1BCA0 <= c && c <= 0x1BCA3) || \
85 (0x1D173 <= c && c <= 0x1D17A) || \
86 (0xE0000 <= c && c <= 0xE0FFF))
89 /*Function Pointer STOPS at the ILLEGAL_SEQUENCE */
91 UCNV_FROM_U_CALLBACK_STOP (
93 UConverterFromUnicodeArgs
*fromUArgs
,
94 const UChar
* codeUnits
,
97 UConverterCallbackReason reason
,
104 if (reason
== UCNV_UNASSIGNED
&& IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint
))
107 * Skip if the codepoint has unicode property of default ignorable.
111 /* the caller must have set the error code accordingly */
116 /*Function Pointer STOPS at the ILLEGAL_SEQUENCE */
117 U_CAPI
void U_EXPORT2
118 UCNV_TO_U_CALLBACK_STOP (
120 UConverterToUnicodeArgs
*toUArgs
,
121 const char* codePoints
,
123 UConverterCallbackReason reason
,
126 /* the caller must have set the error code accordingly */
127 (void)context
; (void)toUArgs
; (void)codePoints
; (void)length
; (void)reason
; (void)err
;
131 U_CAPI
void U_EXPORT2
132 UCNV_FROM_U_CALLBACK_SKIP (
134 UConverterFromUnicodeArgs
*fromUArgs
,
135 const UChar
* codeUnits
,
138 UConverterCallbackReason reason
,
144 if (reason
<= UCNV_IRREGULAR
)
146 if (reason
== UCNV_UNASSIGNED
&& IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint
))
149 * Skip if the codepoint has unicode property of default ignorable.
153 else if (context
== NULL
|| (*((char*)context
) == UCNV_PRV_STOP_ON_ILLEGAL
&& reason
== UCNV_UNASSIGNED
))
157 /* else the caller must have set the error code accordingly. */
159 /* else ignore the reset, close and clone calls. */
162 U_CAPI
void U_EXPORT2
163 UCNV_FROM_U_CALLBACK_SUBSTITUTE (
165 UConverterFromUnicodeArgs
*fromArgs
,
166 const UChar
* codeUnits
,
169 UConverterCallbackReason reason
,
174 if (reason
<= UCNV_IRREGULAR
)
176 if (reason
== UCNV_UNASSIGNED
&& IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint
))
179 * Skip if the codepoint has unicode property of default ignorable.
183 else if (context
== NULL
|| (*((char*)context
) == UCNV_PRV_STOP_ON_ILLEGAL
&& reason
== UCNV_UNASSIGNED
))
186 ucnv_cbFromUWriteSub(fromArgs
, 0, err
);
188 /* else the caller must have set the error code accordingly. */
190 /* else ignore the reset, close and clone calls. */
193 /*uses uprv_itou to get a unicode escape sequence of the offensive sequence,
194 *uses a clean copy (reset) of the converter, to convert that unicode
195 *escape sequence to the target codepage (if conversion failure happens then
196 *we revert to substituting with subchar)
198 U_CAPI
void U_EXPORT2
199 UCNV_FROM_U_CALLBACK_ESCAPE (
201 UConverterFromUnicodeArgs
*fromArgs
,
202 const UChar
*codeUnits
,
205 UConverterCallbackReason reason
,
209 UChar valueString
[VALUE_STRING_LENGTH
];
210 int32_t valueStringLength
= 0;
213 const UChar
*myValueSource
= NULL
;
214 UErrorCode err2
= U_ZERO_ERROR
;
215 UConverterFromUCallback original
= NULL
;
216 const void *originalContext
;
218 UConverterFromUCallback ignoredCallback
= NULL
;
219 const void *ignoredContext
;
221 if (reason
> UCNV_IRREGULAR
)
225 else if (reason
== UCNV_UNASSIGNED
&& IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint
))
228 * Skip if the codepoint has unicode property of default ignorable.
234 ucnv_setFromUCallBack (fromArgs
->converter
,
235 (UConverterFromUCallback
) UCNV_FROM_U_CALLBACK_SUBSTITUTE
,
241 if (U_FAILURE (err2
))
250 valueString
[valueStringLength
++] = (UChar
) UNICODE_PERCENT_SIGN_CODEPOINT
; /* adding % */
251 valueString
[valueStringLength
++] = (UChar
) UNICODE_U_CODEPOINT
; /* adding U */
252 valueStringLength
+= uprv_itou (valueString
+ valueStringLength
, VALUE_STRING_LENGTH
- valueStringLength
, (uint16_t)codeUnits
[i
++], 16, 4);
257 switch(*((char*)context
))
259 case UCNV_PRV_ESCAPE_JAVA
:
262 valueString
[valueStringLength
++] = (UChar
) UNICODE_RS_CODEPOINT
; /* adding \ */
263 valueString
[valueStringLength
++] = (UChar
) UNICODE_U_LOW_CODEPOINT
; /* adding u */
264 valueStringLength
+= uprv_itou (valueString
+ valueStringLength
, VALUE_STRING_LENGTH
- valueStringLength
, (uint16_t)codeUnits
[i
++], 16, 4);
268 case UCNV_PRV_ESCAPE_C
:
269 valueString
[valueStringLength
++] = (UChar
) UNICODE_RS_CODEPOINT
; /* adding \ */
272 valueString
[valueStringLength
++] = (UChar
) UNICODE_U_CODEPOINT
; /* adding U */
273 valueStringLength
+= uprv_itou (valueString
+ valueStringLength
, VALUE_STRING_LENGTH
- valueStringLength
, codePoint
, 16, 8);
277 valueString
[valueStringLength
++] = (UChar
) UNICODE_U_LOW_CODEPOINT
; /* adding u */
278 valueStringLength
+= uprv_itou (valueString
+ valueStringLength
, VALUE_STRING_LENGTH
- valueStringLength
, (uint16_t)codeUnits
[0], 16, 4);
282 case UCNV_PRV_ESCAPE_XML_DEC
:
284 valueString
[valueStringLength
++] = (UChar
) UNICODE_AMP_CODEPOINT
; /* adding & */
285 valueString
[valueStringLength
++] = (UChar
) UNICODE_HASH_CODEPOINT
; /* adding # */
287 valueStringLength
+= uprv_itou (valueString
+ valueStringLength
, VALUE_STRING_LENGTH
- valueStringLength
, codePoint
, 10, 0);
290 valueStringLength
+= uprv_itou (valueString
+ valueStringLength
, VALUE_STRING_LENGTH
- valueStringLength
, (uint16_t)codeUnits
[0], 10, 0);
292 valueString
[valueStringLength
++] = (UChar
) UNICODE_SEMICOLON_CODEPOINT
; /* adding ; */
295 case UCNV_PRV_ESCAPE_XML_HEX
:
297 valueString
[valueStringLength
++] = (UChar
) UNICODE_AMP_CODEPOINT
; /* adding & */
298 valueString
[valueStringLength
++] = (UChar
) UNICODE_HASH_CODEPOINT
; /* adding # */
299 valueString
[valueStringLength
++] = (UChar
) UNICODE_X_LOW_CODEPOINT
; /* adding x */
301 valueStringLength
+= uprv_itou (valueString
+ valueStringLength
, VALUE_STRING_LENGTH
- valueStringLength
, codePoint
, 16, 0);
304 valueStringLength
+= uprv_itou (valueString
+ valueStringLength
, VALUE_STRING_LENGTH
- valueStringLength
, (uint16_t)codeUnits
[0], 16, 0);
306 valueString
[valueStringLength
++] = (UChar
) UNICODE_SEMICOLON_CODEPOINT
; /* adding ; */
309 case UCNV_PRV_ESCAPE_UNICODE
:
310 valueString
[valueStringLength
++] = (UChar
) UNICODE_LEFT_CURLY_CODEPOINT
; /* adding { */
311 valueString
[valueStringLength
++] = (UChar
) UNICODE_U_CODEPOINT
; /* adding U */
312 valueString
[valueStringLength
++] = (UChar
) UNICODE_PLUS_CODEPOINT
; /* adding + */
314 valueStringLength
+= uprv_itou (valueString
+ valueStringLength
, VALUE_STRING_LENGTH
- valueStringLength
, codePoint
, 16, 4);
316 valueStringLength
+= uprv_itou (valueString
+ valueStringLength
, VALUE_STRING_LENGTH
- valueStringLength
, (uint16_t)codeUnits
[0], 16, 4);
318 valueString
[valueStringLength
++] = (UChar
) UNICODE_RIGHT_CURLY_CODEPOINT
; /* adding } */
321 case UCNV_PRV_ESCAPE_CSS2
:
322 valueString
[valueStringLength
++] = (UChar
) UNICODE_RS_CODEPOINT
; /* adding \ */
323 valueStringLength
+= uprv_itou (valueString
+ valueStringLength
, VALUE_STRING_LENGTH
- valueStringLength
, codePoint
, 16, 0);
324 /* Always add space character, because the next character might be whitespace,
325 which would erroneously be considered the termination of the escape sequence. */
326 valueString
[valueStringLength
++] = (UChar
) UNICODE_SPACE_CODEPOINT
;
332 valueString
[valueStringLength
++] = (UChar
) UNICODE_PERCENT_SIGN_CODEPOINT
; /* adding % */
333 valueString
[valueStringLength
++] = (UChar
) UNICODE_U_CODEPOINT
; /* adding U */
334 valueStringLength
+= uprv_itou (valueString
+ valueStringLength
, VALUE_STRING_LENGTH
- valueStringLength
, (uint16_t)codeUnits
[i
++], 16, 4);
338 myValueSource
= valueString
;
340 /* reset the error */
343 ucnv_cbFromUWriteUChars(fromArgs
, &myValueSource
, myValueSource
+valueStringLength
, 0, err
);
345 ucnv_setFromUCallBack (fromArgs
->converter
,
351 if (U_FAILURE (err2
))
362 U_CAPI
void U_EXPORT2
363 UCNV_TO_U_CALLBACK_SKIP (
365 UConverterToUnicodeArgs
*toArgs
,
366 const char* codeUnits
,
368 UConverterCallbackReason reason
,
374 if (reason
<= UCNV_IRREGULAR
)
376 if (context
== NULL
|| (*((char*)context
) == UCNV_PRV_STOP_ON_ILLEGAL
&& reason
== UCNV_UNASSIGNED
))
380 /* else the caller must have set the error code accordingly. */
382 /* else ignore the reset, close and clone calls. */
385 U_CAPI
void U_EXPORT2
386 UCNV_TO_U_CALLBACK_SUBSTITUTE (
388 UConverterToUnicodeArgs
*toArgs
,
389 const char* codeUnits
,
391 UConverterCallbackReason reason
,
396 if (reason
<= UCNV_IRREGULAR
)
398 if (context
== NULL
|| (*((char*)context
) == UCNV_PRV_STOP_ON_ILLEGAL
&& reason
== UCNV_UNASSIGNED
))
401 ucnv_cbToUWriteSub(toArgs
,0,err
);
403 /* else the caller must have set the error code accordingly. */
405 /* else ignore the reset, close and clone calls. */
408 /*uses uprv_itou to get a unicode escape sequence of the offensive sequence,
409 *and uses that as the substitution sequence
411 U_CAPI
void U_EXPORT2
412 UCNV_TO_U_CALLBACK_ESCAPE (
414 UConverterToUnicodeArgs
*toArgs
,
415 const char* codeUnits
,
417 UConverterCallbackReason reason
,
420 UChar uniValueString
[VALUE_STRING_LENGTH
];
421 int32_t valueStringLength
= 0;
424 if (reason
> UCNV_IRREGULAR
)
433 uniValueString
[valueStringLength
++] = (UChar
) UNICODE_PERCENT_SIGN_CODEPOINT
; /* adding % */
434 uniValueString
[valueStringLength
++] = (UChar
) UNICODE_X_CODEPOINT
; /* adding X */
435 valueStringLength
+= uprv_itou (uniValueString
+ valueStringLength
, VALUE_STRING_LENGTH
- valueStringLength
, (uint8_t) codeUnits
[i
++], 16, 2);
440 switch(*((char*)context
))
442 case UCNV_PRV_ESCAPE_XML_DEC
:
445 uniValueString
[valueStringLength
++] = (UChar
) UNICODE_AMP_CODEPOINT
; /* adding & */
446 uniValueString
[valueStringLength
++] = (UChar
) UNICODE_HASH_CODEPOINT
; /* adding # */
447 valueStringLength
+= uprv_itou (uniValueString
+ valueStringLength
, VALUE_STRING_LENGTH
- valueStringLength
, (uint8_t)codeUnits
[i
++], 10, 0);
448 uniValueString
[valueStringLength
++] = (UChar
) UNICODE_SEMICOLON_CODEPOINT
; /* adding ; */
452 case UCNV_PRV_ESCAPE_XML_HEX
:
455 uniValueString
[valueStringLength
++] = (UChar
) UNICODE_AMP_CODEPOINT
; /* adding & */
456 uniValueString
[valueStringLength
++] = (UChar
) UNICODE_HASH_CODEPOINT
; /* adding # */
457 uniValueString
[valueStringLength
++] = (UChar
) UNICODE_X_LOW_CODEPOINT
; /* adding x */
458 valueStringLength
+= uprv_itou (uniValueString
+ valueStringLength
, VALUE_STRING_LENGTH
- valueStringLength
, (uint8_t)codeUnits
[i
++], 16, 0);
459 uniValueString
[valueStringLength
++] = (UChar
) UNICODE_SEMICOLON_CODEPOINT
; /* adding ; */
462 case UCNV_PRV_ESCAPE_C
:
465 uniValueString
[valueStringLength
++] = (UChar
) UNICODE_RS_CODEPOINT
; /* adding \ */
466 uniValueString
[valueStringLength
++] = (UChar
) UNICODE_X_LOW_CODEPOINT
; /* adding x */
467 valueStringLength
+= uprv_itou (uniValueString
+ valueStringLength
, VALUE_STRING_LENGTH
- valueStringLength
, (uint8_t)codeUnits
[i
++], 16, 2);
473 uniValueString
[valueStringLength
++] = (UChar
) UNICODE_PERCENT_SIGN_CODEPOINT
; /* adding % */
474 uniValueString
[valueStringLength
++] = (UChar
) UNICODE_X_CODEPOINT
; /* adding X */
475 uprv_itou (uniValueString
+ valueStringLength
, VALUE_STRING_LENGTH
- valueStringLength
, (uint8_t) codeUnits
[i
++], 16, 2);
476 valueStringLength
+= 2;
480 /* reset the error */
483 ucnv_cbToUWriteUChars(toArgs
, uniValueString
, valueStringLength
, 0, err
);