]>
git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/csrucode.cpp
2 **********************************************************************
3 * Copyright (C) 2005-2006, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
8 #include "unicode/utypes.h"
10 #if !UCONFIG_NO_CONVERSION
16 CharsetRecog_Unicode::~CharsetRecog_Unicode()
21 CharsetRecog_UTF_16_BE::~CharsetRecog_UTF_16_BE()
26 const char *CharsetRecog_UTF_16_BE::getName() const
31 int32_t CharsetRecog_UTF_16_BE::match(InputText
* textIn
)
33 const uint8_t *input
= textIn
->fRawInput
;
35 if (input
[0] == 0xFE && input
[1] == 0xFF) {
39 // TODO: Do some statistics to check for unsigned UTF-16BE
43 CharsetRecog_UTF_16_LE::~CharsetRecog_UTF_16_LE()
48 const char *CharsetRecog_UTF_16_LE::getName() const
53 int32_t CharsetRecog_UTF_16_LE::match(InputText
* textIn
)
55 const uint8_t *input
= textIn
->fRawInput
;
57 if (input
[0] == 0xFF && input
[1] == 0xFE && (input
[2] != 0x00 || input
[3] != 0x00)) {
61 // TODO: Do some statistics to check for unsigned UTF-16LE
65 CharsetRecog_UTF_32::~CharsetRecog_UTF_32()
70 int32_t CharsetRecog_UTF_32::match(InputText
* textIn
)
72 const uint8_t *input
= textIn
->fRawInput
;
73 int32_t limit
= (textIn
->fRawLength
/ 4) * 4;
75 int32_t numInvalid
= 0;
77 int32_t confidence
= 0;
79 if (getChar(input
, 0) == 0x0000FEFFUL
) {
83 for(int32_t i
= 0; i
< limit
; i
+= 4) {
84 int32_t ch
= getChar(input
, i
);
86 if (ch
< 0 || ch
>= 0x10FFFF || (ch
>= 0xD800 && ch
<= 0xDFFF)) {
94 // Cook up some sort of confidence score, based on presence of a BOM
95 // and the existence of valid and/or invalid multi-byte sequences.
96 if (hasBOM
&& numInvalid
==0) {
98 } else if (hasBOM
&& numValid
> numInvalid
*10) {
100 } else if (numValid
> 3 && numInvalid
== 0) {
102 } else if (numValid
> 0 && numInvalid
== 0) {
104 } else if (numValid
> numInvalid
*10) {
105 // Probably corrupt UTF-32BE data. Valid sequences aren't likely by chance.
112 CharsetRecog_UTF_32_BE::~CharsetRecog_UTF_32_BE()
117 const char *CharsetRecog_UTF_32_BE::getName() const
122 int32_t CharsetRecog_UTF_32_BE::getChar(const uint8_t *input
, int32_t index
) const
124 return input
[index
+ 0] << 24 | input
[index
+ 1] << 16 |
125 input
[index
+ 2] << 8 | input
[index
+ 3];
128 CharsetRecog_UTF_32_LE::~CharsetRecog_UTF_32_LE()
133 const char *CharsetRecog_UTF_32_LE::getName() const
138 int32_t CharsetRecog_UTF_32_LE::getChar(const uint8_t *input
, int32_t index
) const
140 return input
[index
+ 3] << 24 | input
[index
+ 2] << 16 |
141 input
[index
+ 1] << 8 | input
[index
+ 0];