X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/57a6839dcb3bba09e8228b822b290604668416fe..HEAD:/icuSources/i18n/csrsbcs.cpp diff --git a/icuSources/i18n/csrsbcs.cpp b/icuSources/i18n/csrsbcs.cpp index d03367cc..c3819585 100644 --- a/icuSources/i18n/csrsbcs.cpp +++ b/icuSources/i18n/csrsbcs.cpp @@ -1,6 +1,8 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html /* ********************************************************************** - * Copyright (C) 2005-2013, International Business Machines + * Copyright (C) 2005-2016, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** */ @@ -15,7 +17,6 @@ #define N_GRAM_SIZE 3 #define N_GRAM_MASK 0xFFFFFF -#define ARRAY_SIZE(array) (sizeof array / sizeof array[0]) U_NAMESPACE_BEGIN @@ -28,6 +29,10 @@ NGramParser::NGramParser(const int32_t *theNgramList, const uint8_t *theCharMap) ngramCount = hitCount = 0; } +NGramParser::~NGramParser() +{ +} + /* * Binary search for value in table, which must have exactly 64 entries. */ @@ -137,6 +142,7 @@ int32_t NGramParser::parse(InputText *det) return (int32_t) (rawPercent * 300.0); } +#if !UCONFIG_ONLY_HTML_CONVERSION static const uint8_t unshapeMap_IBM420[] = { /* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F */ /* 0- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, @@ -162,6 +168,7 @@ NGramParser_IBM420::NGramParser_IBM420(const int32_t *theNgramList, const uint8_ alef = 0x00; } +NGramParser_IBM420::~NGramParser_IBM420() {} int32_t NGramParser_IBM420::isLamAlef(int32_t b) { @@ -232,6 +239,7 @@ void NGramParser_IBM420::parseCharacters(InputText *det) } } } +#endif CharsetRecog_sbcs::CharsetRecog_sbcs() { @@ -624,6 +632,7 @@ static const uint8_t charMap_KOI8_R[] = { 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF, }; +#if !UCONFIG_ONLY_HTML_CONVERSION static const int32_t ngrams_IBM424_he_rtl[] = { 0x404146, 0x404148, 0x404151, 0x404171, 0x404251, 0x404256, 0x404541, 0x404546, 0x404551, 0x404556, 0x404562, 0x404569, 0x404571, 0x405441, 0x405445, 0x405641, 0x406254, 0x406954, 0x417140, 0x454041, 0x454042, 0x454045, 0x454054, 0x454056, 0x454069, 0x454641, 0x464140, 0x465540, 0x465740, 0x466840, 0x467140, 0x514045, @@ -691,6 +700,7 @@ static const uint8_t charMap_IBM420_ar[]= { /* E- */ 0x40, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xEA, 0xEB, 0x40, 0xED, 0xEE, 0xEF, /* F- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0xFB, 0xFC, 0xFD, 0xFE, 0x40, }; +#endif //ISO-8859-1,2,5,6,7,8,9 Ngrams @@ -883,7 +893,7 @@ UBool CharsetRecog_8859_1::match(InputText *textIn, CharsetMatch *results) const const char *name = textIn->fC1Bytes? "windows-1252" : "ISO-8859-1"; uint32_t i; int32_t bestConfidenceSoFar = -1; - for (i=0; i < ARRAY_SIZE(ngrams_8859_1) ; i++) { + for (i=0; i < UPRV_LENGTHOF(ngrams_8859_1) ; i++) { const int32_t *ngrams = ngrams_8859_1[i].ngrams; const char *lang = ngrams_8859_1[i].lang; int32_t confidence = match_sbcs(textIn, ngrams, charMap_8859_1); @@ -892,6 +902,10 @@ UBool CharsetRecog_8859_1::match(InputText *textIn, CharsetMatch *results) const bestConfidenceSoFar = confidence; } } + if (bestConfidenceSoFar < 10 && textIn->fOnlyTypicalASCII) { // rdar://56373519 + bestConfidenceSoFar = 15; + results->set(textIn, this, bestConfidenceSoFar, name); + } return (bestConfidenceSoFar > 0); } @@ -910,7 +924,7 @@ UBool CharsetRecog_8859_2::match(InputText *textIn, CharsetMatch *results) const const char *name = textIn->fC1Bytes? "windows-1250" : "ISO-8859-2"; uint32_t i; int32_t bestConfidenceSoFar = -1; - for (i=0; i < ARRAY_SIZE(ngrams_8859_2) ; i++) { + for (i=0; i < UPRV_LENGTHOF(ngrams_8859_2) ; i++) { const int32_t *ngrams = ngrams_8859_2[i].ngrams; const char *lang = ngrams_8859_2[i].lang; int32_t confidence = match_sbcs(textIn, ngrams, charMap_8859_2); @@ -1155,6 +1169,7 @@ UBool CharsetRecog_KOI8_R::match(InputText *textIn, CharsetMatch *results) const return (confidence > 0); } +#if !UCONFIG_ONLY_HTML_CONVERSION CharsetRecog_IBM424_he::~CharsetRecog_IBM424_he() { // nothing to do @@ -1253,6 +1268,7 @@ UBool CharsetRecog_IBM420_ar_ltr::match(InputText *textIn, CharsetMatch *results results->set(textIn, this, confidence); return (confidence > 0); } +#endif U_NAMESPACE_END #endif