1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 **********************************************************************
5 * Copyright (C) 2005-2016, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
10 #include "unicode/utypes.h"
12 #if !UCONFIG_NO_CONVERSION
// Capacity, in bytes, of the fInputBytes working buffer allocated by the
// InputText constructor; also used as the upper bound when copying input.
23 #define BUFFER_SIZE 8192
// Allocates room for `count` elements of `type` with uprv_malloc and casts the
// result to `type *`. The memory is not initialized here, and the result can be
// NULL (the InputText constructor checks for that).
25 #define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type))
// Releases an array obtained from NEW_ARRAY by handing it to uprv_free.
26 #define DELETE_ARRAY(array) uprv_free((void *) (array))
// Constructs an InputText, allocating its two working arrays up front:
//   fInputBytes - BUFFER_SIZE bytes
//   fByteStats  - 256 int16_t counters, one per possible byte value
// status is set to U_MEMORY_ALLOCATION_ERROR if either allocation returned
// NULL; it is not written in the visible code otherwise.
28 InputText::InputText(UErrorCode
&status
)
29 : fInputBytes(NEW_ARRAY(uint8_t, BUFFER_SIZE
)), // The text to be checked. Markup will have been
30 // removed if appropriate.
31 fByteStats(NEW_ARRAY(int16_t, 256)), // byte frequency statistics for the input text.
32 // Value is percent, not absolute.
// Both arrays come from uprv_malloc (via NEW_ARRAY), so either may be NULL.
37 if (fInputBytes
== NULL
|| fByteStats
== NULL
) {
38 status
= U_MEMORY_ALLOCATION_ERROR
;
// Destructor: releases the three arrays this object frees with uprv_free
// (through DELETE_ARRAY): the declared-encoding string, the byte statistics
// table, and the input byte buffer.
42 InputText::~InputText()
44 DELETE_ARRAY(fDeclaredEncoding
);
45 DELETE_ARRAY(fByteStats
);
46 DELETE_ARRAY(fInputBytes
);
// Records the raw input to be examined.
//   in  - pointer to the input bytes; only the pointer is stored here, the
//         data itself is not copied by these statements.
//   len - number of bytes at `in`, or -1 to have the length taken from
//         uprv_strlen(in).
// NOTE(review): because only the pointer is kept, the caller presumably has
// to keep the buffer alive while this object uses it - confirm with callers.
49 void InputText::setText(const char *in
, int32_t len
)
53 fRawInput
= (const uint8_t *) in
;
54 fRawLength
= len
== -1? (int32_t)uprv_strlen(in
) : len
;
// Replaces the stored declared-encoding name with a newly allocated copy of
// `encoding`.
//   encoding - name to copy.
//   len      - length of `encoding` in chars; the visible code can also derive
//              it from uprv_strlen(encoding) (the condition guarding that
//              assignment is not visible in this view).
// The previous fDeclaredEncoding is freed before the new array is allocated.
// NOTE(review): in the visible statements the result of NEW_ARRAY is passed
// straight to uprv_strncpy with no NULL check. Also, assuming uprv_strncpy has
// strncpy semantics, the terminating \0 is only written when `encoding`
// contains one within the first len + 1 chars - confirm against the full
// function before relying on fDeclaredEncoding being terminated.
57 void InputText::setDeclaredEncoding(const char* encoding
, int32_t len
)
61 len
= (int32_t)uprv_strlen(encoding
);
64 len
+= 1; // to make place for the \0 at the end.
65 uprv_free(fDeclaredEncoding
);
66 fDeclaredEncoding
= NEW_ARRAY(char, len
);
67 uprv_strncpy(fDeclaredEncoding
, encoding
, len
);
// Reports whether raw input has been supplied: returns true when fRawInput
// (assigned by setText) is not NULL. Does not modify the object.
71 UBool
InputText::isSet() const
73 return fRawInput
!= NULL
;
77 * MungeInput - after getting a set of raw input data to be analyzed, preprocess
78 * it by removing what appears to be html markup. Currently only used
79 * by CharsetDetector::detectAll.
83 void InputText::MungeInput(UBool fStripTags
) {
87 bool inMarkup
= FALSE
;
88 bool inCSSDecl
= FALSE
;
93 // html / xml markup stripping.
94 // quick and dirty, not 100% accurate, but hopefully good enough, statistically.
95 // discard everything within < brackets >
96 // Count how many total '<' and illegal (nested) '<' occur, so we can make some
97 // guess as to whether the input was actually marked up at all.
98 // TODO: Think about how this interacts with EBCDIC charsets that are detected.
100 for (srci
= 0; srci
< fRawLength
&& dsti
< BUFFER_SIZE
; srci
+= 1) {
103 if ((b
== (uint8_t)0x3C) && !inCSSDecl
) { /* Check for the ASCII '<' */
111 if ((b
== (uint8_t)0x7B) && !inMarkup
) { /* Check for the ASCII '{' */
119 if (!inMarkup
&& !inCSSDecl
) {
120 fInputBytes
[dsti
++] = b
;
123 if (b
== (uint8_t)0x3E) { /* Check for the ASCII '>' */
126 if (b
== (uint8_t)0x7D) { /* Check for the ASCII '}' */
135 // If it looks like this input wasn't marked up, or if it looks like it's
136 // essentially nothing but markup abandon the markup stripping.
137 // Detection will have to work on the unstripped input.
139 if (openTags
<5 || openTags
/5 < badTags
||
140 (fInputLen
< 100 && fRawLength
>600))
142 int32_t limit
= fRawLength
;
144 if (limit
> BUFFER_SIZE
) {
148 for (srci
=0; srci
<limit
; srci
++) {
149 fInputBytes
[srci
] = fRawInput
[srci
];
156 // Tally up the byte occurrence statistics.
157 // These are available for use by the various detectors.
160 uprv_memset(fByteStats
, 0, (sizeof fByteStats
[0]) * 256);
162 for (srci
= 0; srci
< fInputLen
; srci
+= 1) {
163 fByteStats
[fInputBytes
[srci
]] += 1;
166 for (int32_t i
= 0x80; i
<= 0x9F; i
+= 1) {
167 if (fByteStats
[i
] != 0) {