]> git.saurik.com Git - apple/icu.git/blame - icuSources/i18n/inputext.cpp
ICU-64260.0.1.tar.gz
[apple/icu.git] / icuSources / i18n / inputext.cpp
CommitLineData
f3c0d7a5
A
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
73c04bcf
A
3/*
4 **********************************************************************
2ca993e8 5 * Copyright (C) 2005-2016, International Business Machines
73c04bcf
A
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
8 */
9
10#include "unicode/utypes.h"
11
12#if !UCONFIG_NO_CONVERSION
13
14#include "inputext.h"
15
16#include "cmemory.h"
17#include "cstring.h"
18
19#include <string.h>
20
21U_NAMESPACE_BEGIN
22
23#define BUFFER_SIZE 8192
24
73c04bcf
A
25#define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type))
26#define DELETE_ARRAY(array) uprv_free((void *) (array))
27
46f4442e 28InputText::InputText(UErrorCode &status)
73c04bcf
A
29 : fInputBytes(NEW_ARRAY(uint8_t, BUFFER_SIZE)), // The text to be checked. Markup will have been
30 // removed if appropriate.
31 fByteStats(NEW_ARRAY(int16_t, 256)), // byte frequency statistics for the input text.
32 // Value is percent, not absolute.
33 fDeclaredEncoding(0),
34 fRawInput(0),
35 fRawLength(0)
46f4442e
A
36{
37 if (fInputBytes == NULL || fByteStats == NULL) {
38 status = U_MEMORY_ALLOCATION_ERROR;
39 }
73c04bcf
A
40}
41
42InputText::~InputText()
43{
44 DELETE_ARRAY(fDeclaredEncoding);
45 DELETE_ARRAY(fByteStats);
46 DELETE_ARRAY(fInputBytes);
47}
48
49void InputText::setText(const char *in, int32_t len)
50{
51 fInputLen = 0;
52 fC1Bytes = FALSE;
1a147d09 53 fOnlyTypicalASCII = FALSE; // rdar://56373519
73c04bcf 54 fRawInput = (const uint8_t *) in;
729e4ab9 55 fRawLength = len == -1? (int32_t)uprv_strlen(in) : len;
73c04bcf
A
56}
57
58void InputText::setDeclaredEncoding(const char* encoding, int32_t len)
59{
60 if(encoding) {
61 if (len == -1) {
729e4ab9 62 len = (int32_t)uprv_strlen(encoding);
73c04bcf
A
63 }
64
65 len += 1; // to make place for the \0 at the end.
66 uprv_free(fDeclaredEncoding);
67 fDeclaredEncoding = NEW_ARRAY(char, len);
68 uprv_strncpy(fDeclaredEncoding, encoding, len);
69 }
70}
71
72UBool InputText::isSet() const
73{
74 return fRawInput != NULL;
75}
76
77/**
78* MungeInput - after getting a set of raw input data to be analyzed, preprocess
249c4c5e
A
79* it by removing what appears to be html markup. Currently only used
80* by CharsetDetector::detectAll.
73c04bcf
A
81*
82* @internal
83*/
84void InputText::MungeInput(UBool fStripTags) {
85 int srci = 0;
86 int dsti = 0;
87 uint8_t b;
88 bool inMarkup = FALSE;
249c4c5e 89 bool inCSSDecl = FALSE;
73c04bcf
A
90 int32_t openTags = 0;
91 int32_t badTags = 0;
92
93 //
94 // html / xml markup stripping.
95 // quick and dirty, not 100% accurate, but hopefully good enough, statistically.
96 // discard everything within < brackets >
97 // Count how many total '<' and illegal (nested) '<' occur, so we can make some
98 // guess as to whether the input was actually marked up at all.
99 // TODO: Think about how this interacts with EBCDIC charsets that are detected.
100 if (fStripTags) {
101 for (srci = 0; srci < fRawLength && dsti < BUFFER_SIZE; srci += 1) {
102 b = fRawInput[srci];
103
249c4c5e 104 if ((b == (uint8_t)0x3C) && !inCSSDecl) { /* Check for the ASCII '<' */
73c04bcf
A
105 if (inMarkup) {
106 badTags += 1;
107 }
73c04bcf
A
108 inMarkup = TRUE;
109 openTags += 1;
110 }
111
249c4c5e
A
112 if ((b == (uint8_t)0x7B) && !inMarkup) { /* Check for the ASCII '{' */
113 if (inCSSDecl) {
114 badTags += 1;
115 }
116 inCSSDecl = TRUE;
117 openTags += 1;
118 }
119
120 if (!inMarkup && !inCSSDecl) {
73c04bcf
A
121 fInputBytes[dsti++] = b;
122 }
123
124 if (b == (uint8_t)0x3E) { /* Check for the ASCII '>' */
125 inMarkup = FALSE;
126 }
249c4c5e
A
127 if (b == (uint8_t)0x7D) { /* Check for the ASCII '}' */
128 inCSSDecl = FALSE;
129 }
73c04bcf
A
130 }
131
132 fInputLen = dsti;
133 }
134
135 //
136 // If it looks like this input wasn't marked up, or if it looks like it's
137 // essentially nothing but markup abandon the markup stripping.
138 // Detection will have to work on the unstripped input.
139 //
140 if (openTags<5 || openTags/5 < badTags ||
141 (fInputLen < 100 && fRawLength>600))
142 {
143 int32_t limit = fRawLength;
144
145 if (limit > BUFFER_SIZE) {
146 limit = BUFFER_SIZE;
147 }
148
149 for (srci=0; srci<limit; srci++) {
150 fInputBytes[srci] = fRawInput[srci];
151 }
152
153 fInputLen = srci;
154 }
155
156 //
157 // Tally up the byte occurence statistics.
158 // These are available for use by the various detectors.
159 //
160
161 uprv_memset(fByteStats, 0, (sizeof fByteStats[0]) * 256);
162
163 for (srci = 0; srci < fInputLen; srci += 1) {
164 fByteStats[fInputBytes[srci]] += 1;
165 }
166
1a147d09
A
167 fOnlyTypicalASCII = TRUE; // rdar://56373519
168 for (int32_t i = 0x01; i <= 0xFF; i += 1) {
73c04bcf 169 if (fByteStats[i] != 0) {
1a147d09
A
170 if ((i < 0x20 && i != 0x09 && i != 0x0A && i != 0x0D) || i > 0x7E) {
171 fOnlyTypicalASCII = FALSE; // rdar://56373519
172 if (i >= 0x80 && i <= 0x9F) {
173 fC1Bytes = TRUE;
174 }
175 }
73c04bcf
A
176 }
177 }
1a147d09
A
178 if (fByteStats[0] > 1) {
179 fOnlyTypicalASCII = FALSE;
180 }
73c04bcf
A
181}
182
183U_NAMESPACE_END
184#endif
185