]>
git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/csrucode.cpp
2 **********************************************************************
3 * Copyright (C) 2005-2012, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
8 #include "unicode/utypes.h"
10 #if !UCONFIG_NO_CONVERSION
17 CharsetRecog_Unicode::~CharsetRecog_Unicode()
22 CharsetRecog_UTF_16_BE::~CharsetRecog_UTF_16_BE()
27 const char *CharsetRecog_UTF_16_BE::getName() const
32 UBool
CharsetRecog_UTF_16_BE::match(InputText
* textIn
, CharsetMatch
*results
) const
34 const uint8_t *input
= textIn
->fRawInput
;
35 int32_t confidence
= 0;
37 if (input
[0] == 0xFE && input
[1] == 0xFF) {
41 // TODO: Do some statistics to check for unsigned UTF-16BE
42 results
->set(textIn
, this, confidence
);
43 return (confidence
> 0);
46 CharsetRecog_UTF_16_LE::~CharsetRecog_UTF_16_LE()
51 const char *CharsetRecog_UTF_16_LE::getName() const
56 UBool
CharsetRecog_UTF_16_LE::match(InputText
* textIn
, CharsetMatch
*results
) const
58 const uint8_t *input
= textIn
->fRawInput
;
59 int32_t confidence
= 0;
61 if (input
[0] == 0xFF && input
[1] == 0xFE && (input
[2] != 0x00 || input
[3] != 0x00)) {
65 // TODO: Do some statistics to check for unsigned UTF-16LE
66 results
->set(textIn
, this, confidence
);
67 return (confidence
> 0);
70 CharsetRecog_UTF_32::~CharsetRecog_UTF_32()
75 UBool
CharsetRecog_UTF_32::match(InputText
* textIn
, CharsetMatch
*results
) const
77 const uint8_t *input
= textIn
->fRawInput
;
78 int32_t limit
= (textIn
->fRawLength
/ 4) * 4;
80 int32_t numInvalid
= 0;
82 int32_t confidence
= 0;
84 if (getChar(input
, 0) == 0x0000FEFFUL
) {
88 for(int32_t i
= 0; i
< limit
; i
+= 4) {
89 int32_t ch
= getChar(input
, i
);
91 if (ch
< 0 || ch
>= 0x10FFFF || (ch
>= 0xD800 && ch
<= 0xDFFF)) {
99 // Cook up some sort of confidence score, based on presence of a BOM
100 // and the existence of valid and/or invalid multi-byte sequences.
101 if (hasBOM
&& numInvalid
==0) {
103 } else if (hasBOM
&& numValid
> numInvalid
*10) {
105 } else if (numValid
> 3 && numInvalid
== 0) {
107 } else if (numValid
> 0 && numInvalid
== 0) {
109 } else if (numValid
> numInvalid
*10) {
110 // Probably corrupt UTF-32 data. Valid sequences aren't likely by chance.
114 results
->set(textIn
, this, confidence
);
115 return (confidence
> 0);
118 CharsetRecog_UTF_32_BE::~CharsetRecog_UTF_32_BE()
123 const char *CharsetRecog_UTF_32_BE::getName() const
128 int32_t CharsetRecog_UTF_32_BE::getChar(const uint8_t *input
, int32_t index
) const
130 return input
[index
+ 0] << 24 | input
[index
+ 1] << 16 |
131 input
[index
+ 2] << 8 | input
[index
+ 3];
134 CharsetRecog_UTF_32_LE::~CharsetRecog_UTF_32_LE()
139 const char *CharsetRecog_UTF_32_LE::getName() const
144 int32_t CharsetRecog_UTF_32_LE::getChar(const uint8_t *input
, int32_t index
) const
146 return input
[index
+ 3] << 24 | input
[index
+ 2] << 16 |
147 input
[index
+ 1] << 8 | input
[index
+ 0];