/**
* MungeInput - after getting a set of raw input data to be analyzed, preprocess
-* it by removing what appears to be html markup.
+* it by removing what appears to be HTML markup and text between '{' and
+* '}' (e.g. CSS declaration blocks). Currently only used by
+* CharsetDetector::detectAll.
*
* @internal
*/
int dsti = 0;
uint8_t b;
bool inMarkup = FALSE;
+ bool inCSSDecl = FALSE;
int32_t openTags = 0;
int32_t badTags = 0;
for (srci = 0; srci < fRawLength && dsti < BUFFER_SIZE; srci += 1) {
b = fRawInput[srci];
- if (b == (uint8_t)0x3C) { /* Check for the ASCII '<' */
+ if ((b == (uint8_t)0x3C) && !inCSSDecl) { /* Check for the ASCII '<' */
if (inMarkup) {
badTags += 1;
}
-
inMarkup = TRUE;
openTags += 1;
}
- if (! inMarkup) {
+ if ((b == (uint8_t)0x7B) && !inMarkup) { /* Check for the ASCII '{' */
+ if (inCSSDecl) {
+ badTags += 1;
+ }
+ inCSSDecl = TRUE;
+ openTags += 1;
+ }
+
+ if (!inMarkup && !inCSSDecl) {
fInputBytes[dsti++] = b;
}
if (b == (uint8_t)0x3E) { /* Check for the ASCII '>' */
inMarkup = FALSE;
}
+ if (b == (uint8_t)0x7D) { /* Check for the ASCII '}' */
+ inCSSDecl = FALSE;
+ }
}
fInputLen = dsti;
* Test whether input filtering is enabled for this charset detector.
* Input filtering removes text that appears to be HTML or xml
* markup from the input before applying the code page detection
- * heuristics.
+ * heuristics. Apple addition per <rdar://problem/48093252>: Will also
+ * remove text that appears to be CSS declaration blocks.
*
* @param ucsd The charset detector to check.
* @return TRUE if filtering is enabled.
* Enable filtering of input text. If filtering is enabled,
* text within angle brackets ("<" and ">") will be removed
* before detection, which will remove most HTML or xml markup.
+ * Apple addition per <rdar://problem/48093252>: Will also
+ * remove text between '{' and '}', e.g. CSS declaration blocks.
*
* @param ucsd the charset detector to be modified.
* @param filter <code>true</code> to enable input text filtering.
static void TestParseCurrPatternWithDecStyle(void);
static void TestParseCases(void);
static void TestFormatPrecision(void);
+static void TestSciNotationRound(void); // Apple <rdar://problem/49159521>
#define TESTCASE(x) addTest(root, &x, "tsformat/cnumtst/" #x)
TESTCASE(TestParseCurrPatternWithDecStyle);
TESTCASE(TestParseCases);
TESTCASE(TestFormatPrecision);
+ TESTCASE(TestSciNotationRound);
}
/* test Parse int 64 */
}
}
+// Currently Apple only for <rdar://problem/49159521>
+enum { kBBufMax = 128 };
+static const UChar* pat1 = u"#.##E+00;-#.##E+00";
+/*
+ * Formats successive powers of ten (1e22 up to, but not including, 1e39)
+ * with the scientific pattern pat1, rounding mode UNUM_ROUND_HALFUP and a
+ * maximum of 50 fraction digits, and checks that every formatted result
+ * begins with "1E+". Every failure is reported through the test logger so
+ * that it is counted by the harness.
+ */
+static void TestSciNotationRound(void) {
+    UErrorCode status = U_ZERO_ERROR;
+    UNumberFormat* unum = unum_open(UNUM_PATTERN_DECIMAL, NULL, 0, "en_US", NULL, &status);
+    if ( U_FAILURE(status) ) {
+        log_data_err("unum_open UNUM_PATTERN_DECIMAL with null pattern for \"en_US\" fails with %s\n", u_errorName(status));
+    } else {
+        unum_applyPattern(unum, FALSE, pat1, u_strlen(pat1), NULL, &status);
+        if ( U_FAILURE(status) ) {
+            log_err("unum_applyPattern fails with %s\n", u_errorName(status));
+        } else {
+            double value;
+            UChar ubuf[kUBufMax];
+            char bbuf[kBBufMax];
+            int32_t ulen;
+
+            unum_setAttribute(unum, UNUM_ROUNDING_MODE, UNUM_ROUND_HALFUP);
+            unum_setAttribute(unum, UNUM_MIN_FRACTION_DIGITS, 0);
+            unum_setAttribute(unum, UNUM_MAX_FRACTION_DIGITS, 50); // problem happens at 15 or more
+
+            for (value = 10000000000000000000000.0; value < 1000000000000000000000000000000000000000.0; value *= 10.0) {
+                status = U_ZERO_ERROR;
+                ulen = unum_formatDouble(unum, value, ubuf, kUBufMax, NULL, &status);
+                if ( U_FAILURE(status) ) {
+                    // Report through log_err (not printf) so the failure is counted.
+                    log_err("unum_formatDouble value %.1f status %s\n", value, u_errorName(status));
+                } else if (u_strncmp(ubuf,u"1E+",3) != 0) {
+                    // Use a separate status for the diagnostic conversion and make
+                    // sure bbuf is always a terminated string before printing it.
+                    UErrorCode convStatus = U_ZERO_ERROR;
+                    u_strToUTF8(bbuf, kBBufMax - 1, NULL, ubuf, ulen, &convStatus);
+                    bbuf[kBBufMax - 1] = 0;
+                    if ( U_FAILURE(convStatus) && convStatus != U_BUFFER_OVERFLOW_ERROR ) {
+                        bbuf[0] = 0;
+                    }
+                    log_err("unum_formatDouble value %.1f expected result to begin with 1E+, got %s\n", value, bbuf);
+                }
+            }
+        }
+        unum_close(unum);
+    }
+}
+
#endif /* #if !UCONFIG_NO_FORMATTING */