]>
git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/csrucode.cpp
   1 // © 2016 and later: Unicode, Inc. and others. 
   2 // License & terms of use: http://www.unicode.org/copyright.html 
   4  ********************************************************************** 
   5  *   Copyright (C) 2005-2013, International Business Machines 
   6  *   Corporation and others.  All Rights Reserved. 
   7  ********************************************************************** 
  10 #include "unicode/utypes.h" 
  12 #if !UCONFIG_NO_CONVERSION 
  19 CharsetRecog_Unicode::~CharsetRecog_Unicode() 
  24 CharsetRecog_UTF_16_BE::~CharsetRecog_UTF_16_BE() 
  29 const char *CharsetRecog_UTF_16_BE::getName() const 
  34 // UTF-16 confidence calculation. Very simple minded, but better than nothing. 
  35 //   Any 8 bit non-control characters bump the confidence up. These have a zero high byte, 
  36 //     and are very likely to be UTF-16, although they could also be part of a UTF-32 code. 
  37 //   NULs are a contra-indication, they will appear commonly if the actual encoding is UTF-32. 
  38 //   NULs should be rare in actual text.  
  40 static int32_t adjustConfidence(UChar codeUnit
, int32_t confidence
) { 
  43     } else if ((codeUnit 
>= 0x20 && codeUnit 
<= 0xff) || codeUnit 
== 0x0a) { 
  48     } else if (confidence 
> 100) { 
  55 UBool 
CharsetRecog_UTF_16_BE::match(InputText
* textIn
, CharsetMatch 
*results
) const 
  57     const uint8_t *input 
= textIn
->fRawInput
; 
  58     int32_t confidence 
= 10; 
  59     int32_t length 
= textIn
->fRawLength
; 
  61     int32_t bytesToCheck 
= (length 
> 30) ? 30 : length
; 
  62     for (int32_t charIndex
=0; charIndex
<bytesToCheck
-1; charIndex
+=2) { 
  63         UChar codeUnit 
= (input
[charIndex
] << 8) | input
[charIndex 
+ 1]; 
  64         if (charIndex 
== 0 && codeUnit 
== 0xFEFF) { 
  68         confidence 
= adjustConfidence(codeUnit
, confidence
); 
  69         if (confidence 
== 0 || confidence 
== 100) { 
  73     if (bytesToCheck 
< 4 && confidence 
< 100) { 
  76     results
->set(textIn
, this, confidence
); 
  77     return (confidence 
> 0); 
  80 CharsetRecog_UTF_16_LE::~CharsetRecog_UTF_16_LE() 
  85 const char *CharsetRecog_UTF_16_LE::getName() const 
  90 UBool 
CharsetRecog_UTF_16_LE::match(InputText
* textIn
, CharsetMatch 
*results
) const 
  92     const uint8_t *input 
= textIn
->fRawInput
; 
  93     int32_t confidence 
= 10; 
  94     int32_t length 
= textIn
->fRawLength
; 
  96     int32_t bytesToCheck 
= (length 
> 30) ? 30 : length
; 
  97     for (int32_t charIndex
=0; charIndex
<bytesToCheck
-1; charIndex
+=2) { 
  98         UChar codeUnit 
= input
[charIndex
] | (input
[charIndex 
+ 1] << 8); 
  99         if (charIndex 
== 0 && codeUnit 
== 0xFEFF) { 
 100             confidence 
= 100;     // UTF-16 BOM 
 101             if (length 
>= 4 && input
[2] == 0 && input
[3] == 0) { 
 102                 confidence 
= 0;   // UTF-32 BOM 
 106         confidence 
= adjustConfidence(codeUnit
, confidence
); 
 107         if (confidence 
== 0 || confidence 
== 100) { 
 111     if (bytesToCheck 
< 4 && confidence 
< 100) { 
 114     results
->set(textIn
, this, confidence
); 
 115     return (confidence 
> 0); 
 118 CharsetRecog_UTF_32::~CharsetRecog_UTF_32() 
 123 UBool 
CharsetRecog_UTF_32::match(InputText
* textIn
, CharsetMatch 
*results
) const 
 125     const uint8_t *input 
= textIn
->fRawInput
; 
 126     int32_t limit 
= (textIn
->fRawLength 
/ 4) * 4; 
 127     int32_t numValid 
= 0; 
 128     int32_t numInvalid 
= 0; 
 130     int32_t confidence 
= 0; 
 132     if (limit 
> 0 && getChar(input
, 0) == 0x0000FEFFUL
) { 
 136     for(int32_t i 
= 0; i 
< limit
; i 
+= 4) { 
 137         int32_t ch 
= getChar(input
, i
); 
 139         if (ch 
< 0 || ch 
>= 0x10FFFF || (ch 
>= 0xD800 && ch 
<= 0xDFFF)) { 
 147     // Cook up some sort of confidence score, based on presence of a BOM 
 148     //    and the existence of valid and/or invalid multi-byte sequences. 
 149     if (hasBOM 
&& numInvalid
==0) { 
 151     } else if (hasBOM 
&& numValid 
> numInvalid
*10) { 
 153     } else if (numValid 
> 3 && numInvalid 
== 0) { 
 155     } else if (numValid 
> 0 && numInvalid 
== 0) { 
 157     } else if (numValid 
> numInvalid
*10) { 
 158         // Probably corrupt UTF-32 data.  Valid sequences aren't likely by chance. 
 162     results
->set(textIn
, this, confidence
); 
 163     return (confidence 
> 0); 
 166 CharsetRecog_UTF_32_BE::~CharsetRecog_UTF_32_BE() 
 171 const char *CharsetRecog_UTF_32_BE::getName() const 
 176 int32_t CharsetRecog_UTF_32_BE::getChar(const uint8_t *input
, int32_t index
) const 
 178     return input
[index 
+ 0] << 24 | input
[index 
+ 1] << 16 | 
 179            input
[index 
+ 2] <<  8 | input
[index 
+ 3]; 
 182 CharsetRecog_UTF_32_LE::~CharsetRecog_UTF_32_LE() 
 187 const char *CharsetRecog_UTF_32_LE::getName() const 
 192 int32_t CharsetRecog_UTF_32_LE::getChar(const uint8_t *input
, int32_t index
) const 
 194     return input
[index 
+ 3] << 24 | input
[index 
+ 2] << 16 | 
 195            input
[index 
+ 1] <<  8 | input
[index 
+ 0];