]>
Commit | Line | Data |
---|---|---|
f3c0d7a5 A |
1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html | |
73c04bcf A |
3 | /* |
4 | ********************************************************************** | |
2ca993e8 | 5 | * Copyright (C) 2005-2016, International Business Machines |
73c04bcf A |
6 | * Corporation and others. All Rights Reserved. |
7 | ********************************************************************** | |
8 | */ | |
9 | ||
10 | #include "unicode/utypes.h" | |
11 | ||
12 | #if !UCONFIG_NO_CONVERSION | |
13 | ||
14 | #include "inputext.h" | |
15 | ||
16 | #include "cmemory.h" | |
17 | #include "cstring.h" | |
18 | ||
19 | #include <string.h> | |
20 | ||
21 | U_NAMESPACE_BEGIN | |
22 | ||
// Capacity, in bytes, of the fInputBytes working buffer. MungeInput() never
// stores more than this many bytes of (possibly markup-stripped) input.
#define BUFFER_SIZE 8192

// Allocate storage for `count` objects of `type` through uprv_malloc().
// The expansion is fully parenthesized so that it behaves as a single operand
// when used inside a larger expression (e.g. NEW_ARRAY(T, n)[i]).
#define NEW_ARRAY(type,count) ((type *) uprv_malloc((count) * sizeof(type)))
// Release storage previously obtained from NEW_ARRAY.
#define DELETE_ARRAY(array) uprv_free((void *) (array))
27 | ||
46f4442e | 28 | InputText::InputText(UErrorCode &status) |
73c04bcf A |
29 | : fInputBytes(NEW_ARRAY(uint8_t, BUFFER_SIZE)), // The text to be checked. Markup will have been |
30 | // removed if appropriate. | |
31 | fByteStats(NEW_ARRAY(int16_t, 256)), // byte frequency statistics for the input text. | |
32 | // Value is percent, not absolute. | |
33 | fDeclaredEncoding(0), | |
34 | fRawInput(0), | |
35 | fRawLength(0) | |
46f4442e A |
36 | { |
37 | if (fInputBytes == NULL || fByteStats == NULL) { | |
38 | status = U_MEMORY_ALLOCATION_ERROR; | |
39 | } | |
73c04bcf A |
40 | } |
41 | ||
42 | InputText::~InputText() | |
43 | { | |
44 | DELETE_ARRAY(fDeclaredEncoding); | |
45 | DELETE_ARRAY(fByteStats); | |
46 | DELETE_ARRAY(fInputBytes); | |
47 | } | |
48 | ||
49 | void InputText::setText(const char *in, int32_t len) | |
50 | { | |
51 | fInputLen = 0; | |
52 | fC1Bytes = FALSE; | |
1a147d09 | 53 | fOnlyTypicalASCII = FALSE; // rdar://56373519 |
73c04bcf | 54 | fRawInput = (const uint8_t *) in; |
729e4ab9 | 55 | fRawLength = len == -1? (int32_t)uprv_strlen(in) : len; |
73c04bcf A |
56 | } |
57 | ||
58 | void InputText::setDeclaredEncoding(const char* encoding, int32_t len) | |
59 | { | |
60 | if(encoding) { | |
61 | if (len == -1) { | |
729e4ab9 | 62 | len = (int32_t)uprv_strlen(encoding); |
73c04bcf A |
63 | } |
64 | ||
65 | len += 1; // to make place for the \0 at the end. | |
66 | uprv_free(fDeclaredEncoding); | |
67 | fDeclaredEncoding = NEW_ARRAY(char, len); | |
68 | uprv_strncpy(fDeclaredEncoding, encoding, len); | |
69 | } | |
70 | } | |
71 | ||
72 | UBool InputText::isSet() const | |
73 | { | |
74 | return fRawInput != NULL; | |
75 | } | |
76 | ||
77 | /** | |
78 | * MungeInput - after getting a set of raw input data to be analyzed, preprocess | |
249c4c5e A |
79 | * it by removing what appears to be html markup. Currently only used |
80 | * by CharsetDetector::detectAll. | |
73c04bcf A |
81 | * |
82 | * @internal | |
83 | */ | |
84 | void InputText::MungeInput(UBool fStripTags) { | |
85 | int srci = 0; | |
86 | int dsti = 0; | |
87 | uint8_t b; | |
88 | bool inMarkup = FALSE; | |
249c4c5e | 89 | bool inCSSDecl = FALSE; |
73c04bcf A |
90 | int32_t openTags = 0; |
91 | int32_t badTags = 0; | |
92 | ||
93 | // | |
94 | // html / xml markup stripping. | |
95 | // quick and dirty, not 100% accurate, but hopefully good enough, statistically. | |
96 | // discard everything within < brackets > | |
97 | // Count how many total '<' and illegal (nested) '<' occur, so we can make some | |
98 | // guess as to whether the input was actually marked up at all. | |
99 | // TODO: Think about how this interacts with EBCDIC charsets that are detected. | |
100 | if (fStripTags) { | |
101 | for (srci = 0; srci < fRawLength && dsti < BUFFER_SIZE; srci += 1) { | |
102 | b = fRawInput[srci]; | |
103 | ||
249c4c5e | 104 | if ((b == (uint8_t)0x3C) && !inCSSDecl) { /* Check for the ASCII '<' */ |
73c04bcf A |
105 | if (inMarkup) { |
106 | badTags += 1; | |
107 | } | |
73c04bcf A |
108 | inMarkup = TRUE; |
109 | openTags += 1; | |
110 | } | |
111 | ||
249c4c5e A |
112 | if ((b == (uint8_t)0x7B) && !inMarkup) { /* Check for the ASCII '{' */ |
113 | if (inCSSDecl) { | |
114 | badTags += 1; | |
115 | } | |
116 | inCSSDecl = TRUE; | |
117 | openTags += 1; | |
118 | } | |
119 | ||
120 | if (!inMarkup && !inCSSDecl) { | |
73c04bcf A |
121 | fInputBytes[dsti++] = b; |
122 | } | |
123 | ||
124 | if (b == (uint8_t)0x3E) { /* Check for the ASCII '>' */ | |
125 | inMarkup = FALSE; | |
126 | } | |
249c4c5e A |
127 | if (b == (uint8_t)0x7D) { /* Check for the ASCII '}' */ |
128 | inCSSDecl = FALSE; | |
129 | } | |
73c04bcf A |
130 | } |
131 | ||
132 | fInputLen = dsti; | |
133 | } | |
134 | ||
135 | // | |
136 | // If it looks like this input wasn't marked up, or if it looks like it's | |
137 | // essentially nothing but markup abandon the markup stripping. | |
138 | // Detection will have to work on the unstripped input. | |
139 | // | |
140 | if (openTags<5 || openTags/5 < badTags || | |
141 | (fInputLen < 100 && fRawLength>600)) | |
142 | { | |
143 | int32_t limit = fRawLength; | |
144 | ||
145 | if (limit > BUFFER_SIZE) { | |
146 | limit = BUFFER_SIZE; | |
147 | } | |
148 | ||
149 | for (srci=0; srci<limit; srci++) { | |
150 | fInputBytes[srci] = fRawInput[srci]; | |
151 | } | |
152 | ||
153 | fInputLen = srci; | |
154 | } | |
155 | ||
156 | // | |
157 | // Tally up the byte occurence statistics. | |
158 | // These are available for use by the various detectors. | |
159 | // | |
160 | ||
161 | uprv_memset(fByteStats, 0, (sizeof fByteStats[0]) * 256); | |
162 | ||
163 | for (srci = 0; srci < fInputLen; srci += 1) { | |
164 | fByteStats[fInputBytes[srci]] += 1; | |
165 | } | |
166 | ||
1a147d09 A |
167 | fOnlyTypicalASCII = TRUE; // rdar://56373519 |
168 | for (int32_t i = 0x01; i <= 0xFF; i += 1) { | |
73c04bcf | 169 | if (fByteStats[i] != 0) { |
1a147d09 A |
170 | if ((i < 0x20 && i != 0x09 && i != 0x0A && i != 0x0D) || i > 0x7E) { |
171 | fOnlyTypicalASCII = FALSE; // rdar://56373519 | |
172 | if (i >= 0x80 && i <= 0x9F) { | |
173 | fC1Bytes = TRUE; | |
174 | } | |
175 | } | |
73c04bcf A |
176 | } |
177 | } | |
1a147d09 A |
178 | if (fByteStats[0] > 1) { |
179 | fOnlyTypicalASCII = FALSE; | |
180 | } | |
73c04bcf A |
181 | } |
182 | ||
183 | U_NAMESPACE_END | |
184 | #endif | |
185 |