icuSources/common/ucnv_err.cpp

   1 // © 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /*
   4  *****************************************************************************
   5  *
   6  *   Copyright (C) 1998-2016, International Business Machines
   7  *   Corporation and others.  All Rights Reserved.
   8  *
   9  *****************************************************************************
  10  *
  11  *  ucnv_err.c
  12  *  Implements error behaviour functions called by T_UConverter_{from,to}Unicode
  13  *
  14  *
  15 *   Change history:
  16 *
  17 *   06/29/2000  helena      Major rewrite of the callback APIs.
  18 */
  19
  20 #include "unicode/utypes.h"
  21
  22 #if !UCONFIG_NO_CONVERSION
  23
  24 #include "unicode/ucnv_err.h"
  25 #include "unicode/ucnv_cb.h"
  26 #include "ucnv_cnv.h"
  27 #include "cmemory.h"
  28 #include "unicode/ucnv.h"
  29 #include "ustrfmt.h"
  30
  31 #define VALUE_STRING_LENGTH 48
  32 /*Magic # 32 = 4(number of char in value string) * 8(max number of bytes per char for any converter) */
  33 #define UNICODE_PERCENT_SIGN_CODEPOINT  0x0025
  34 #define UNICODE_U_CODEPOINT             0x0055
  35 #define UNICODE_X_CODEPOINT             0x0058
  36 #define UNICODE_RS_CODEPOINT            0x005C
  37 #define UNICODE_U_LOW_CODEPOINT         0x0075
  38 #define UNICODE_X_LOW_CODEPOINT         0x0078
  39 #define UNICODE_AMP_CODEPOINT           0x0026
  40 #define UNICODE_HASH_CODEPOINT          0x0023
  41 #define UNICODE_SEMICOLON_CODEPOINT     0x003B
  42 #define UNICODE_PLUS_CODEPOINT          0x002B
  43 #define UNICODE_LEFT_CURLY_CODEPOINT    0x007B
  44 #define UNICODE_RIGHT_CURLY_CODEPOINT   0x007D
  45 #define UNICODE_SPACE_CODEPOINT         0x0020
  46 #define UCNV_PRV_ESCAPE_ICU         0
  47 #define UCNV_PRV_ESCAPE_C           'C'
  48 #define UCNV_PRV_ESCAPE_XML_DEC     'D'
  49 #define UCNV_PRV_ESCAPE_XML_HEX     'X'
  50 #define UCNV_PRV_ESCAPE_JAVA        'J'
  51 #define UCNV_PRV_ESCAPE_UNICODE     'U'
  52 #define UCNV_PRV_ESCAPE_CSS2        'S'
  53 #define UCNV_PRV_STOP_ON_ILLEGAL    'i'
  54
  55 /*
  56  * IS_DEFAULT_IGNORABLE_CODE_POINT
  57  * This is to check if a code point has the default ignorable unicode property.
  58  * As such, this list needs to be updated if the ignorable code point list ever
  59  * changes.
  60  * To avoid dependency on other code, this list is hard coded here.
  61  * When an ignorable code point is found and is unmappable, the default callbacks
  62  * will ignore them.
  63  * For a list of the default ignorable code points, use this link: http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[%3ADI%3A]&g=
  64  *
  65  * This list should be sync with the one in CharsetCallback.java
  66  */
  67 #define IS_DEFAULT_IGNORABLE_CODE_POINT(c) (\
  68     (c == 0x00AD) || \
  69     (c == 0x034F) || \
  70     (c == 0x061C) || \
  71     (c == 0x115F) || \
  72     (c == 0x1160) || \
  73     (0x17B4 <= c && c <= 0x17B5) || \
  74     (0x180B <= c && c <= 0x180E) || \
  75     (0x200B <= c && c <= 0x200F) || \
  76     (0x202A <= c && c <= 0x202E) || \
  77     (c == 0x2060) || \
  78     (0x2066 <= c && c <= 0x2069) || \
  79     (0x2061 <= c && c <= 0x2064) || \
  80     (0x206A <= c && c <= 0x206F) || \
  81     (c == 0x3164) || \
  82     (0x0FE00 <= c && c <= 0x0FE0F) || \
  83     (c == 0x0FEFF) || \
  84     (c == 0x0FFA0) || \
  85     (0x01BCA0  <= c && c <= 0x01BCA3) || \
  86     (0x01D173 <= c && c <= 0x01D17A) || \
  87     (c == 0x0E0001) || \
  88     (0x0E0020 <= c && c <= 0x0E007F) || \
  89     (0x0E0100 <= c && c <= 0x0E01EF) || \
  90     (c == 0x2065) || \
  91     (0x0FFF0 <= c && c <= 0x0FFF8) || \
  92     (c == 0x0E0000) || \
  93     (0x0E0002 <= c && c <= 0x0E001F) || \
  94     (0x0E0080 <= c && c <= 0x0E00FF) || \
  95     (0x0E01F0 <= c && c <= 0x0E0FFF) \
  96     )
  97
  98
  99 /*Function Pointer STOPS at the ILLEGAL_SEQUENCE */
 100 U_CAPI void    U_EXPORT2
 101 UCNV_FROM_U_CALLBACK_STOP (
 102                   const void *context,
 103                   UConverterFromUnicodeArgs *fromUArgs,
 104                   const UChar* codeUnits,
 105                   int32_t length,
 106                   UChar32 codePoint,
 107                   UConverterCallbackReason reason,
 108                   UErrorCode * err)
 109 {
 110     (void)context;
 111     (void)fromUArgs;
 112     (void)codeUnits;
 113     (void)length;
 114     if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint))
 115     {
 116         /*
 117          * Skip if the codepoint has unicode property of default ignorable.
 118          */
 119         *err = U_ZERO_ERROR;
 120     }
 121     /* the caller must have set the error code accordingly */
 122     return;
 123 }
 124
 125
 126 /*Function Pointer STOPS at the ILLEGAL_SEQUENCE */
 127 U_CAPI void    U_EXPORT2
 128 UCNV_TO_U_CALLBACK_STOP (
 129                    const void *context,
 130                    UConverterToUnicodeArgs *toUArgs,
 131                    const char* codePoints,
 132                    int32_t length,
 133                    UConverterCallbackReason reason,
 134                    UErrorCode * err)
 135 {
 136     /* the caller must have set the error code accordingly */
 137     (void)context; (void)toUArgs; (void)codePoints; (void)length; (void)reason; (void)err;
 138     return;
 139 }
 140
 141 U_CAPI void    U_EXPORT2
 142 UCNV_FROM_U_CALLBACK_SKIP (
 143                   const void *context,
 144                   UConverterFromUnicodeArgs *fromUArgs,
 145                   const UChar* codeUnits,
 146                   int32_t length,
 147                   UChar32 codePoint,
 148                   UConverterCallbackReason reason,
 149                   UErrorCode * err)
 150 {
 151     (void)fromUArgs;
 152     (void)codeUnits;
 153     (void)length;
 154     if (reason <= UCNV_IRREGULAR)
 155     {
 156         if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint))
 157         {
 158             /*
 159              * Skip if the codepoint has unicode property of default ignorable.
 160              */
 161             *err = U_ZERO_ERROR;
 162         }
 163         else if (context == NULL || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))
 164         {
 165             *err = U_ZERO_ERROR;
 166         }
 167         /* else the caller must have set the error code accordingly. */
 168     }
 169     /* else ignore the reset, close and clone calls. */
 170 }
 171
 172 U_CAPI void    U_EXPORT2
 173 UCNV_FROM_U_CALLBACK_SUBSTITUTE (
 174                   const void *context,
 175                   UConverterFromUnicodeArgs *fromArgs,
 176                   const UChar* codeUnits,
 177                   int32_t length,
 178                   UChar32 codePoint,
 179                   UConverterCallbackReason reason,
 180                   UErrorCode * err)
 181 {
 182     (void)codeUnits;
 183     (void)length;
 184     if (reason <= UCNV_IRREGULAR)
 185     {
 186         if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint))
 187         {
 188             /*
 189              * Skip if the codepoint has unicode property of default ignorable.
 190              */
 191             *err = U_ZERO_ERROR;
 192         }
 193         else if (context == NULL || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))
 194         {
 195             *err = U_ZERO_ERROR;
 196             ucnv_cbFromUWriteSub(fromArgs, 0, err);
 197         }
 198         /* else the caller must have set the error code accordingly. */
 199     }
 200     /* else ignore the reset, close and clone calls. */
 201 }
 202
 203 /*uses uprv_itou to get a unicode escape sequence of the offensive sequence,
 204  *uses a clean copy (resetted) of the converter, to convert that unicode
 205  *escape sequence to the target codepage (if conversion failure happens then
 206  *we revert to substituting with subchar)
 207  */
 208 U_CAPI void    U_EXPORT2
 209 UCNV_FROM_U_CALLBACK_ESCAPE (
 210                          const void *context,
 211                          UConverterFromUnicodeArgs *fromArgs,
 212                          const UChar *codeUnits,
 213                          int32_t length,
 214                          UChar32 codePoint,
 215                          UConverterCallbackReason reason,
 216                          UErrorCode * err)
 217 {
 218
 219   UChar valueString[VALUE_STRING_LENGTH];
 220   int32_t valueStringLength = 0;
 221   int32_t i = 0;
 222
 223   const UChar *myValueSource = NULL;
 224   UErrorCode err2 = U_ZERO_ERROR;
 225   UConverterFromUCallback original = NULL;
 226   const void *originalContext;
 227
 228   UConverterFromUCallback ignoredCallback = NULL;
 229   const void *ignoredContext;
 230
 231   if (reason > UCNV_IRREGULAR)
 232   {
 233       return;
 234   }
 235   else if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint))
 236   {
 237       /*
 238        * Skip if the codepoint has unicode property of default ignorable.
 239        */
 240       *err = U_ZERO_ERROR;
 241       return;
 242   }
 243
 244   ucnv_setFromUCallBack (fromArgs->converter,
 245                      (UConverterFromUCallback) UCNV_FROM_U_CALLBACK_SUBSTITUTE,
 246                      NULL,
 247                      &original,
 248                      &originalContext,
 249                      &err2);
 250
 251   if (U_FAILURE (err2))
 252   {
 253     *err = err2;
 254     return;
 255   }
 256   if(context==NULL)
 257   {
 258       while (i < length)
 259       {
 260         valueString[valueStringLength++] = (UChar) UNICODE_PERCENT_SIGN_CODEPOINT;  /* adding % */
 261         valueString[valueStringLength++] = (UChar) UNICODE_U_CODEPOINT; /* adding U */
 262         valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[i++], 16, 4);
 263       }
 264   }
 265   else
 266   {
 267       switch(*((char*)context))
 268       {
 269       case UCNV_PRV_ESCAPE_JAVA:
 270           while (i < length)
 271           {
 272               valueString[valueStringLength++] = (UChar) UNICODE_RS_CODEPOINT;    /* adding \ */
 273               valueString[valueStringLength++] = (UChar) UNICODE_U_LOW_CODEPOINT; /* adding u */
 274               valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[i++], 16, 4);
 275           }
 276           break;
 277
 278       case UCNV_PRV_ESCAPE_C:
 279           valueString[valueStringLength++] = (UChar) UNICODE_RS_CODEPOINT;    /* adding \ */
 280
 281           if(length==2){
 282               valueString[valueStringLength++] = (UChar) UNICODE_U_CODEPOINT; /* adding U */
 283               valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 8);
 284
 285           }
 286           else{
 287               valueString[valueStringLength++] = (UChar) UNICODE_U_LOW_CODEPOINT; /* adding u */
 288               valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 16, 4);
 289           }
 290           break;
 291
 292       case UCNV_PRV_ESCAPE_XML_DEC:
 293
 294           valueString[valueStringLength++] = (UChar) UNICODE_AMP_CODEPOINT;   /* adding & */
 295           valueString[valueStringLength++] = (UChar) UNICODE_HASH_CODEPOINT;  /* adding # */
 296           if(length==2){
 297               valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 10, 0);
 298           }
 299           else{
 300               valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 10, 0);
 301           }
 302           valueString[valueStringLength++] = (UChar) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
 303           break;
 304
 305       case UCNV_PRV_ESCAPE_XML_HEX:
 306
 307           valueString[valueStringLength++] = (UChar) UNICODE_AMP_CODEPOINT;   /* adding & */
 308           valueString[valueStringLength++] = (UChar) UNICODE_HASH_CODEPOINT;  /* adding # */
 309           valueString[valueStringLength++] = (UChar) UNICODE_X_LOW_CODEPOINT; /* adding x */
 310           if(length==2){
 311               valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 0);
 312           }
 313           else{
 314               valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 16, 0);
 315           }
 316           valueString[valueStringLength++] = (UChar) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
 317           break;
 318
 319       case UCNV_PRV_ESCAPE_UNICODE:
 320           valueString[valueStringLength++] = (UChar) UNICODE_LEFT_CURLY_CODEPOINT;    /* adding { */
 321           valueString[valueStringLength++] = (UChar) UNICODE_U_CODEPOINT;    /* adding U */
 322           valueString[valueStringLength++] = (UChar) UNICODE_PLUS_CODEPOINT; /* adding + */
 323           if (length == 2) {
 324               valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 4);
 325           } else {
 326               valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 16, 4);
 327           }
 328           valueString[valueStringLength++] = (UChar) UNICODE_RIGHT_CURLY_CODEPOINT;    /* adding } */
 329           break;
 330
 331       case UCNV_PRV_ESCAPE_CSS2:
 332           valueString[valueStringLength++] = (UChar) UNICODE_RS_CODEPOINT;    /* adding \ */
 333           valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 0);
 334           /* Always add space character, becase the next character might be whitespace,
 335              which would erroneously be considered the termination of the escape sequence. */
 336           valueString[valueStringLength++] = (UChar) UNICODE_SPACE_CODEPOINT;
 337           break;
 338
 339       default:
 340           while (i < length)
 341           {
 342               valueString[valueStringLength++] = (UChar) UNICODE_PERCENT_SIGN_CODEPOINT;  /* adding % */
 343               valueString[valueStringLength++] = (UChar) UNICODE_U_CODEPOINT;             /* adding U */
 344               valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[i++], 16, 4);
 345           }
 346       }
 347   }
 348   myValueSource = valueString;
 349
 350   /* reset the error */
 351   *err = U_ZERO_ERROR;
 352
 353   ucnv_cbFromUWriteUChars(fromArgs, &myValueSource, myValueSource+valueStringLength, 0, err);
 354
 355   ucnv_setFromUCallBack (fromArgs->converter,
 356                          original,
 357                          originalContext,
 358                          &ignoredCallback,
 359                          &ignoredContext,
 360                          &err2);
 361   if (U_FAILURE (err2))
 362   {
 363       *err = err2;
 364       return;
 365   }
 366
 367   return;
 368 }
 369
 370
 371
 372 U_CAPI void  U_EXPORT2
 373 UCNV_TO_U_CALLBACK_SKIP (
 374                  const void *context,
 375                  UConverterToUnicodeArgs *toArgs,
 376                  const char* codeUnits,
 377                  int32_t length,
 378                  UConverterCallbackReason reason,
 379                  UErrorCode * err)
 380 {
 381     (void)toArgs;
 382     (void)codeUnits;
 383     (void)length;
 384     if (reason <= UCNV_IRREGULAR)
 385     {
 386         if (context == NULL || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))
 387         {
 388             *err = U_ZERO_ERROR;
 389         }
 390         /* else the caller must have set the error code accordingly. */
 391     }
 392     /* else ignore the reset, close and clone calls. */
 393 }
 394
 395 U_CAPI void    U_EXPORT2
 396 UCNV_TO_U_CALLBACK_SUBSTITUTE (
 397                  const void *context,
 398                  UConverterToUnicodeArgs *toArgs,
 399                  const char* codeUnits,
 400                  int32_t length,
 401                  UConverterCallbackReason reason,
 402                  UErrorCode * err)
 403 {
 404     (void)codeUnits;
 405     (void)length;
 406     if (reason <= UCNV_IRREGULAR)
 407     {
 408         if (context == NULL || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))
 409         {
 410             *err = U_ZERO_ERROR;
 411             ucnv_cbToUWriteSub(toArgs,0,err);
 412         }
 413         /* else the caller must have set the error code accordingly. */
 414     }
 415     /* else ignore the reset, close and clone calls. */
 416 }
 417
 418 /*uses uprv_itou to get a unicode escape sequence of the offensive sequence,
 419  *and uses that as the substitution sequence
 420  */
 421 U_CAPI void   U_EXPORT2
 422 UCNV_TO_U_CALLBACK_ESCAPE (
 423                  const void *context,
 424                  UConverterToUnicodeArgs *toArgs,
 425                  const char* codeUnits,
 426                  int32_t length,
 427                  UConverterCallbackReason reason,
 428                  UErrorCode * err)
 429 {
 430     UChar uniValueString[VALUE_STRING_LENGTH];
 431     int32_t valueStringLength = 0;
 432     int32_t i = 0;
 433
 434     if (reason > UCNV_IRREGULAR)
 435     {
 436         return;
 437     }
 438
 439     if(context==NULL)
 440     {
 441         while (i < length)
 442         {
 443             uniValueString[valueStringLength++] = (UChar) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */
 444             uniValueString[valueStringLength++] = (UChar) UNICODE_X_CODEPOINT;    /* adding X */
 445             valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t) codeUnits[i++], 16, 2);
 446         }
 447     }
 448     else
 449     {
 450         switch(*((char*)context))
 451         {
 452         case UCNV_PRV_ESCAPE_XML_DEC:
 453             while (i < length)
 454             {
 455                 uniValueString[valueStringLength++] = (UChar) UNICODE_AMP_CODEPOINT;   /* adding & */
 456                 uniValueString[valueStringLength++] = (UChar) UNICODE_HASH_CODEPOINT;  /* adding # */
 457                 valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t)codeUnits[i++], 10, 0);
 458                 uniValueString[valueStringLength++] = (UChar) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
 459             }
 460             break;
 461
 462         case UCNV_PRV_ESCAPE_XML_HEX:
 463             while (i < length)
 464             {
 465                 uniValueString[valueStringLength++] = (UChar) UNICODE_AMP_CODEPOINT;   /* adding & */
 466                 uniValueString[valueStringLength++] = (UChar) UNICODE_HASH_CODEPOINT;  /* adding # */
 467                 uniValueString[valueStringLength++] = (UChar) UNICODE_X_LOW_CODEPOINT; /* adding x */
 468                 valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t)codeUnits[i++], 16, 0);
 469                 uniValueString[valueStringLength++] = (UChar) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
 470             }
 471             break;
 472         case UCNV_PRV_ESCAPE_C:
 473             while (i < length)
 474             {
 475                 uniValueString[valueStringLength++] = (UChar) UNICODE_RS_CODEPOINT;    /* adding \ */
 476                 uniValueString[valueStringLength++] = (UChar) UNICODE_X_LOW_CODEPOINT; /* adding x */
 477                 valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t)codeUnits[i++], 16, 2);
 478             }
 479             break;
 480         default:
 481             while (i < length)
 482             {
 483                 uniValueString[valueStringLength++] = (UChar) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */
 484                 uniValueString[valueStringLength++] = (UChar) UNICODE_X_CODEPOINT;    /* adding X */
 485                 uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t) codeUnits[i++], 16, 2);
 486                 valueStringLength += 2;
 487             }
 488         }
 489     }
 490     /* reset the error */
 491     *err = U_ZERO_ERROR;
 492
 493     ucnv_cbToUWriteUChars(toArgs, uniValueString, valueStringLength, 0, err);
 494 }
 495
 496 #endif