]>
Commit | Line | Data |
---|---|---|
73c04bcf A |
1 | /* |
2 | ********************************************************************** | |
3 | * Copyright (C) 2005-2006, International Business Machines | |
4 | * Corporation and others. All Rights Reserved. | |
5 | ********************************************************************** | |
6 | */ | |
7 | ||
8 | #include "unicode/utypes.h" | |
9 | ||
10 | #if !UCONFIG_NO_CONVERSION | |
11 | ||
12 | #include "csrucode.h" | |
13 | ||
14 | U_NAMESPACE_BEGIN | |
15 | ||
16 | CharsetRecog_Unicode::~CharsetRecog_Unicode() | |
17 | { | |
18 | // nothing to do | |
19 | } | |
20 | ||
21 | CharsetRecog_UTF_16_BE::~CharsetRecog_UTF_16_BE() | |
22 | { | |
23 | // nothing to do | |
24 | } | |
25 | ||
26 | const char *CharsetRecog_UTF_16_BE::getName() const | |
27 | { | |
28 | return "UTF-16BE"; | |
29 | } | |
30 | ||
31 | int32_t CharsetRecog_UTF_16_BE::match(InputText* textIn) | |
32 | { | |
33 | const uint8_t *input = textIn->fRawInput; | |
34 | ||
35 | if (input[0] == 0xFE && input[1] == 0xFF) { | |
36 | return 100; | |
37 | } | |
38 | ||
39 | // TODO: Do some statastics to check for unsigned UTF-16BE | |
40 | return 0; | |
41 | } | |
42 | ||
43 | CharsetRecog_UTF_16_LE::~CharsetRecog_UTF_16_LE() | |
44 | { | |
45 | // nothing to do | |
46 | } | |
47 | ||
48 | const char *CharsetRecog_UTF_16_LE::getName() const | |
49 | { | |
50 | return "UTF-16LE"; | |
51 | } | |
52 | ||
53 | int32_t CharsetRecog_UTF_16_LE::match(InputText* textIn) | |
54 | { | |
55 | const uint8_t *input = textIn->fRawInput; | |
56 | ||
57 | if (input[0] == 0xFF && input[1] == 0xFE && (input[2] != 0x00 || input[3] != 0x00)) { | |
58 | return 100; | |
59 | } | |
60 | ||
61 | // TODO: Do some statastics to check for unsigned UTF-16LE | |
62 | return 0; | |
63 | } | |
64 | ||
65 | CharsetRecog_UTF_32::~CharsetRecog_UTF_32() | |
66 | { | |
67 | // nothing to do | |
68 | } | |
69 | ||
70 | int32_t CharsetRecog_UTF_32::match(InputText* textIn) | |
71 | { | |
72 | const uint8_t *input = textIn->fRawInput; | |
73 | int32_t limit = (textIn->fRawLength / 4) * 4; | |
74 | int32_t numValid = 0; | |
75 | int32_t numInvalid = 0; | |
76 | bool hasBOM = FALSE; | |
77 | int32_t confidence = 0; | |
78 | ||
79 | if (getChar(input, 0) == 0x0000FEFFUL) { | |
80 | hasBOM = TRUE; | |
81 | } | |
82 | ||
83 | for(int32_t i = 0; i < limit; i += 4) { | |
84 | int32_t ch = getChar(input, i); | |
85 | ||
86 | if (ch < 0 || ch >= 0x10FFFF || (ch >= 0xD800 && ch <= 0xDFFF)) { | |
87 | numInvalid += 1; | |
88 | } else { | |
89 | numValid += 1; | |
90 | } | |
91 | } | |
92 | ||
93 | ||
94 | // Cook up some sort of confidence score, based on presense of a BOM | |
95 | // and the existence of valid and/or invalid multi-byte sequences. | |
96 | if (hasBOM && numInvalid==0) { | |
97 | confidence = 100; | |
98 | } else if (hasBOM && numValid > numInvalid*10) { | |
99 | confidence = 80; | |
100 | } else if (numValid > 3 && numInvalid == 0) { | |
101 | confidence = 100; | |
102 | } else if (numValid > 0 && numInvalid == 0) { | |
103 | confidence = 80; | |
104 | } else if (numValid > numInvalid*10) { | |
105 | // Probably corruput UTF-32BE data. Valid sequences aren't likely by chance. | |
106 | confidence = 25; | |
107 | } | |
108 | ||
109 | return confidence; | |
110 | } | |
111 | ||
112 | CharsetRecog_UTF_32_BE::~CharsetRecog_UTF_32_BE() | |
113 | { | |
114 | // nothing to do | |
115 | } | |
116 | ||
117 | const char *CharsetRecog_UTF_32_BE::getName() const | |
118 | { | |
119 | return "UTF-32BE"; | |
120 | } | |
121 | ||
122 | int32_t CharsetRecog_UTF_32_BE::getChar(const uint8_t *input, int32_t index) const | |
123 | { | |
124 | return input[index + 0] << 24 | input[index + 1] << 16 | | |
125 | input[index + 2] << 8 | input[index + 3]; | |
126 | } | |
127 | ||
128 | CharsetRecog_UTF_32_LE::~CharsetRecog_UTF_32_LE() | |
129 | { | |
130 | // nothing to do | |
131 | } | |
132 | ||
133 | const char *CharsetRecog_UTF_32_LE::getName() const | |
134 | { | |
135 | return "UTF-32LE"; | |
136 | } | |
137 | ||
138 | int32_t CharsetRecog_UTF_32_LE::getChar(const uint8_t *input, int32_t index) const | |
139 | { | |
140 | return input[index + 3] << 24 | input[index + 2] << 16 | | |
141 | input[index + 1] << 8 | input[index + 0]; | |
142 | } | |
143 | ||
144 | U_NAMESPACE_END | |
145 | #endif | |
146 |