]> git.saurik.com Git - apple/icu.git/blame - icuSources/i18n/inputext.cpp
ICU-8.11.tar.gz
[apple/icu.git] / icuSources / i18n / inputext.cpp
CommitLineData
73c04bcf
A
1/*
2 **********************************************************************
3 * Copyright (C) 2005-2006, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 */
7
8#include "unicode/utypes.h"
9
10#if !UCONFIG_NO_CONVERSION
11
12#include "inputext.h"
13
14#include "cmemory.h"
15#include "cstring.h"
16
17#include <string.h>
18
19U_NAMESPACE_BEGIN
20
// Capacity, in bytes, of the working buffer that receives the (optionally
// markup-stripped) copy of the raw input.
#define BUFFER_SIZE 8192

// Number of elements in a true array (not a pointer). The argument is fully
// parenthesized so that any array-valued expression can be passed safely.
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))

// Typed convenience wrappers around uprv_malloc() / uprv_free().
// NEW_ARRAY performs no construction and does not check the allocation
// result; callers are responsible for handling a NULL return.
#define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type))
#define DELETE_ARRAY(array) uprv_free((void *) (array))
27
28InputText::InputText()
29 : fInputBytes(NEW_ARRAY(uint8_t, BUFFER_SIZE)), // The text to be checked. Markup will have been
30 // removed if appropriate.
31 fByteStats(NEW_ARRAY(int16_t, 256)), // byte frequency statistics for the input text.
32 // Value is percent, not absolute.
33 fDeclaredEncoding(0),
34 fRawInput(0),
35 fRawLength(0)
36{
37
38}
39
40InputText::~InputText()
41{
42 DELETE_ARRAY(fDeclaredEncoding);
43 DELETE_ARRAY(fByteStats);
44 DELETE_ARRAY(fInputBytes);
45}
46
47void InputText::setText(const char *in, int32_t len)
48{
49 fInputLen = 0;
50 fC1Bytes = FALSE;
51 fRawInput = (const uint8_t *) in;
52 fRawLength = len == -1? uprv_strlen(in) : len;
53}
54
55void InputText::setDeclaredEncoding(const char* encoding, int32_t len)
56{
57 if(encoding) {
58 if (len == -1) {
59 len = uprv_strlen(encoding);
60 }
61
62 len += 1; // to make place for the \0 at the end.
63 uprv_free(fDeclaredEncoding);
64 fDeclaredEncoding = NEW_ARRAY(char, len);
65 uprv_strncpy(fDeclaredEncoding, encoding, len);
66 }
67}
68
69UBool InputText::isSet() const
70{
71 return fRawInput != NULL;
72}
73
74/**
75* MungeInput - after getting a set of raw input data to be analyzed, preprocess
76* it by removing what appears to be html markup.
77*
78* @internal
79*/
80void InputText::MungeInput(UBool fStripTags) {
81 int srci = 0;
82 int dsti = 0;
83 uint8_t b;
84 bool inMarkup = FALSE;
85 int32_t openTags = 0;
86 int32_t badTags = 0;
87
88 //
89 // html / xml markup stripping.
90 // quick and dirty, not 100% accurate, but hopefully good enough, statistically.
91 // discard everything within < brackets >
92 // Count how many total '<' and illegal (nested) '<' occur, so we can make some
93 // guess as to whether the input was actually marked up at all.
94 // TODO: Think about how this interacts with EBCDIC charsets that are detected.
95 if (fStripTags) {
96 for (srci = 0; srci < fRawLength && dsti < BUFFER_SIZE; srci += 1) {
97 b = fRawInput[srci];
98
99 if (b == (uint8_t)0x3C) { /* Check for the ASCII '<' */
100 if (inMarkup) {
101 badTags += 1;
102 }
103
104 inMarkup = TRUE;
105 openTags += 1;
106 }
107
108 if (! inMarkup) {
109 fInputBytes[dsti++] = b;
110 }
111
112 if (b == (uint8_t)0x3E) { /* Check for the ASCII '>' */
113 inMarkup = FALSE;
114 }
115 }
116
117 fInputLen = dsti;
118 }
119
120 //
121 // If it looks like this input wasn't marked up, or if it looks like it's
122 // essentially nothing but markup abandon the markup stripping.
123 // Detection will have to work on the unstripped input.
124 //
125 if (openTags<5 || openTags/5 < badTags ||
126 (fInputLen < 100 && fRawLength>600))
127 {
128 int32_t limit = fRawLength;
129
130 if (limit > BUFFER_SIZE) {
131 limit = BUFFER_SIZE;
132 }
133
134 for (srci=0; srci<limit; srci++) {
135 fInputBytes[srci] = fRawInput[srci];
136 }
137
138 fInputLen = srci;
139 }
140
141 //
142 // Tally up the byte occurence statistics.
143 // These are available for use by the various detectors.
144 //
145
146 uprv_memset(fByteStats, 0, (sizeof fByteStats[0]) * 256);
147
148 for (srci = 0; srci < fInputLen; srci += 1) {
149 fByteStats[fInputBytes[srci]] += 1;
150 }
151
152 for (int32_t i = 0x80; i <= 0x9F; i += 1) {
153 if (fByteStats[i] != 0) {
154 fC1Bytes = TRUE;
155 break;
156 }
157 }
158}
159
160U_NAMESPACE_END
161#endif
162