X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/51004dcb01e06fef634b61be77ed73dd61cb6db9..HEAD:/icuSources/i18n/csrsbcs.cpp diff --git a/icuSources/i18n/csrsbcs.cpp b/icuSources/i18n/csrsbcs.cpp index 1aad70e3..c3819585 100644 --- a/icuSources/i18n/csrsbcs.cpp +++ b/icuSources/i18n/csrsbcs.cpp @@ -1,6 +1,8 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html /* ********************************************************************** - * Copyright (C) 2005-2012, International Business Machines + * Copyright (C) 2005-2016, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** */ @@ -15,12 +17,11 @@ #define N_GRAM_SIZE 3 #define N_GRAM_MASK 0xFFFFFF -#define ARRAY_SIZE(array) (sizeof array / sizeof array[0]) U_NAMESPACE_BEGIN NGramParser::NGramParser(const int32_t *theNgramList, const uint8_t *theCharMap) - :byteIndex(0), ngram(0) + : ngram(0), byteIndex(0) { ngramList = theNgramList; charMap = theCharMap; @@ -28,6 +29,10 @@ NGramParser::NGramParser(const int32_t *theNgramList, const uint8_t *theCharMap) ngramCount = hitCount = 0; } +NGramParser::~NGramParser() +{ +} + /* * Binary search for value in table, which must have exactly 64 entries. */ @@ -96,7 +101,7 @@ int32_t NGramParser::nextByte(InputText *det) return det->fInputBytes[byteIndex++]; } -int32_t NGramParser::parse(InputText *det) +void NGramParser::parseCharacters(InputText *det) { int32_t b; bool ignoreSpace = FALSE; @@ -113,6 +118,11 @@ int32_t NGramParser::parse(InputText *det) ignoreSpace = (mb == 0x20); } } +} + +int32_t NGramParser::parse(InputText *det) +{ + parseCharacters(det); // TODO: Is this OK? The buffer could have ended in the middle of a word... addByte(0x20); @@ -132,6 +142,105 @@ int32_t NGramParser::parse(InputText *det) return (int32_t) (rawPercent * 300.0); } +#if !UCONFIG_ONLY_HTML_CONVERSION +static const uint8_t unshapeMap_IBM420[] = { +/* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F */ +/* 0- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, +/* 1- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, +/* 2- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, +/* 3- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, +/* 4- */ 0x40, 0x40, 0x42, 0x42, 0x44, 0x45, 0x46, 0x47, 0x47, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, +/* 5- */ 0x50, 0x49, 0x52, 0x53, 0x54, 0x55, 0x56, 0x56, 0x58, 0x58, 0x5A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F, +/* 6- */ 0x60, 0x61, 0x62, 0x63, 0x63, 0x65, 0x65, 0x67, 0x67, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, +/* 7- */ 0x69, 0x71, 0x71, 0x73, 0x74, 0x75, 0x76, 0x77, 0x77, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F, +/* 8- */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x80, 0x8B, 0x8B, 0x8D, 0x8D, 0x8F, +/* 9- */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9A, 0x9A, 0x9A, 0x9E, 0x9E, +/* A- */ 0x9E, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x9E, 0xAB, 0xAB, 0xAD, 0xAD, 0xAF, +/* B- */ 0xAF, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xB1, 0xBB, 0xBB, 0xBD, 0xBD, 0xBF, +/* C- */ 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xBF, 0xCC, 0xBF, 0xCE, 0xCF, +/* D- */ 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xDA, 0xDC, 0xDC, 0xDC, 0xDF, +/* E- */ 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, +/* F- */ 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF, +}; + +NGramParser_IBM420::NGramParser_IBM420(const int32_t *theNgramList, const uint8_t *theCharMap):NGramParser(theNgramList, theCharMap) +{ + alef = 0x00; +} + +NGramParser_IBM420::~NGramParser_IBM420() {} + +int32_t NGramParser_IBM420::isLamAlef(int32_t b) +{ + if(b == 0xB2 || b == 0xB3){ + return 0x47; + }else if(b == 0xB4 || b == 0xB5){ + return 0x49; + }else if(b == 0xB8 || b == 0xB9){ + return 0x56; + }else + return 0x00; +} + +/* +* Arabic shaping needs to be done manually. Cannot call ArabicShaping class +* because CharsetDetector is dealing with bytes not Unicode code points. We could +* convert the bytes to Unicode code points but that would leave us dependent +* on CharsetICU which we try to avoid. IBM420 converter amongst different versions +* of JDK can produce different results and therefore is also avoided. +*/ +int32_t NGramParser_IBM420::nextByte(InputText *det) +{ + + if (byteIndex >= det->fInputLen || det->fInputBytes[byteIndex] == 0) { + return -1; + } + int next; + + alef = isLamAlef(det->fInputBytes[byteIndex]); + if(alef != 0x00) + next = 0xB1 & 0xFF; + else + next = unshapeMap_IBM420[det->fInputBytes[byteIndex]& 0xFF] & 0xFF; + + byteIndex++; + + return next; +} + +void NGramParser_IBM420::parseCharacters(InputText *det) +{ + int32_t b; + bool ignoreSpace = FALSE; + + while ((b = nextByte(det)) >= 0) { + uint8_t mb = charMap[b]; + + // TODO: 0x20 might not be a space in all character sets... + if (mb != 0) { + if (!(mb == 0x20 && ignoreSpace)) { + addByte(mb); + } + ignoreSpace = (mb == 0x20); + } + + if(alef != 0x00){ + mb = charMap[alef & 0xFF]; + + // TODO: 0x20 might not be a space in all character sets... + if (mb != 0) { + if (!(mb == 0x20 && ignoreSpace)) { + addByte(mb); + } + + ignoreSpace = (mb == 0x20); + } + + } + } +} +#endif + CharsetRecog_sbcs::CharsetRecog_sbcs() { // nothing else to do @@ -523,6 +632,7 @@ static const uint8_t charMap_KOI8_R[] = { 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF, }; +#if !UCONFIG_ONLY_HTML_CONVERSION static const int32_t ngrams_IBM424_he_rtl[] = { 0x404146, 0x404148, 0x404151, 0x404171, 0x404251, 0x404256, 0x404541, 0x404546, 0x404551, 0x404556, 0x404562, 0x404569, 0x404571, 0x405441, 0x405445, 0x405641, 0x406254, 0x406954, 0x417140, 0x454041, 0x454042, 0x454045, 0x454054, 0x454056, 0x454069, 0x454641, 0x464140, 0x465540, 0x465740, 0x466840, 0x467140, 0x514045, @@ -590,6 +700,7 @@ static const uint8_t charMap_IBM420_ar[]= { /* E- */ 0x40, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xEA, 0xEB, 0x40, 0xED, 0xEE, 0xEF, /* F- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0xFB, 0xFC, 0xFD, 0xFE, 0x40, }; +#endif //ISO-8859-1,2,5,6,7,8,9 Ngrams @@ -782,7 +893,7 @@ UBool CharsetRecog_8859_1::match(InputText *textIn, CharsetMatch *results) const const char *name = textIn->fC1Bytes? "windows-1252" : "ISO-8859-1"; uint32_t i; int32_t bestConfidenceSoFar = -1; - for (i=0; i < ARRAY_SIZE(ngrams_8859_1) ; i++) { + for (i=0; i < UPRV_LENGTHOF(ngrams_8859_1) ; i++) { const int32_t *ngrams = ngrams_8859_1[i].ngrams; const char *lang = ngrams_8859_1[i].lang; int32_t confidence = match_sbcs(textIn, ngrams, charMap_8859_1); @@ -791,6 +902,10 @@ UBool CharsetRecog_8859_1::match(InputText *textIn, CharsetMatch *results) const bestConfidenceSoFar = confidence; } } + if (bestConfidenceSoFar < 10 && textIn->fOnlyTypicalASCII) { // rdar://56373519 + bestConfidenceSoFar = 15; + results->set(textIn, this, bestConfidenceSoFar, name); + } return (bestConfidenceSoFar > 0); } @@ -809,7 +924,7 @@ UBool CharsetRecog_8859_2::match(InputText *textIn, CharsetMatch *results) const const char *name = textIn->fC1Bytes? "windows-1250" : "ISO-8859-2"; uint32_t i; int32_t bestConfidenceSoFar = -1; - for (i=0; i < ARRAY_SIZE(ngrams_8859_2) ; i++) { + for (i=0; i < UPRV_LENGTHOF(ngrams_8859_2) ; i++) { const int32_t *ngrams = ngrams_8859_2[i].ngrams; const char *lang = ngrams_8859_2[i].lang; int32_t confidence = match_sbcs(textIn, ngrams, charMap_8859_2); @@ -1054,6 +1169,7 @@ UBool CharsetRecog_KOI8_R::match(InputText *textIn, CharsetMatch *results) const return (confidence > 0); } +#if !UCONFIG_ONLY_HTML_CONVERSION CharsetRecog_IBM424_he::~CharsetRecog_IBM424_he() { // nothing to do @@ -1098,26 +1214,6 @@ UBool CharsetRecog_IBM424_he_ltr::match(InputText *textIn, CharsetMatch *results return (confidence > 0); } -static const uint8_t unshapeMap_IBM420[] = { -/* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F */ -/* 0- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, -/* 1- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, -/* 2- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, -/* 3- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, -/* 4- */ 0x40, 0x40, 0x42, 0x42, 0x44, 0x45, 0x46, 0x47, 0x47, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, -/* 5- */ 0x50, 0x49, 0x52, 0x53, 0x54, 0x55, 0x56, 0x56, 0x58, 0x58, 0x5A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F, -/* 6- */ 0x60, 0x61, 0x62, 0x63, 0x63, 0x65, 0x65, 0x67, 0x67, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, -/* 7- */ 0x69, 0x71, 0x71, 0x73, 0x74, 0x75, 0x76, 0x77, 0x77, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F, -/* 8- */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x80, 0x8B, 0x8B, 0x8D, 0x8D, 0x8F, -/* 9- */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9A, 0x9A, 0x9A, 0x9E, 0x9E, -/* A- */ 0x9E, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x9E, 0xAB, 0xAB, 0xAD, 0xAD, 0xAF, -/* B- */ 0xAF, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xB1, 0xBB, 0xBB, 0xBD, 0xBD, 0xBF, -/* C- */ 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xBF, 0xCC, 0xBF, 0xCE, 0xCF, -/* D- */ 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xDA, 0xDC, 0xDC, 0xDC, 0xDF, -/* E- */ 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, -/* F- */ 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF, -}; - CharsetRecog_IBM420_ar::~CharsetRecog_IBM420_ar() { // nothing to do @@ -1128,88 +1224,15 @@ const char *CharsetRecog_IBM420_ar::getLanguage() const return "ar"; } -void CharsetRecog_IBM420_ar::matchInit(InputText *textIn) { - prev_fInputBytesLength = textIn->fInputLen; - prev_fInputBytes = textIn->fInputBytes; - int32_t length = 0; - uint8_t *bb = unshape(prev_fInputBytes, prev_fInputBytesLength, length); - - if (bb != NULL) { - textIn->fInputBytes = bb; - textIn->fInputLen = length; - - deleteBuffer = TRUE; - } else { - deleteBuffer = FALSE; - } -} - -uint8_t *CharsetRecog_IBM420_ar::unshape(const uint8_t *inputBytes, int32_t inputBytesLength, int32_t &length) { - uint8_t *resultArray = unshapeLamAlef(inputBytes, inputBytesLength, length); - - if (resultArray != NULL) { - for (int32_t i = 0; i < inputBytesLength; i++) { - resultArray[i] = unshapeMap_IBM420[resultArray[i]]; - } - } - - return resultArray; -} - -uint8_t *CharsetRecog_IBM420_ar::unshapeLamAlef(const uint8_t *inputBytes, int32_t inputBytesLength, int32_t &length) { - int32_t bigBufferLength = inputBytesLength * 2; - uint8_t *bigBuffer = (uint8_t *)uprv_malloc(bigBufferLength); - uint8_t *resultBuffer = NULL; - - if (bigBuffer != NULL) { - int32_t bufferIndex; - static const uint8_t unshapedLamAlef[] = { 0xb1, 0x56 }; - - for (int32_t i = bufferIndex = 0; i < inputBytesLength; i++) { - if (isLamAlef(inputBytes[i])) { - bigBuffer[bufferIndex++] = unshapedLamAlef[0]; - bigBuffer[bufferIndex++] = unshapedLamAlef[1]; - } else { - bigBuffer[bufferIndex++] = inputBytes[i]; - } - } - - length = bufferIndex; - resultBuffer = (uint8_t *)uprv_malloc(length); - if (resultBuffer != NULL) { - uprv_memcpy(resultBuffer, bigBuffer, length); - } - } - - if (bigBuffer != NULL) { - uprv_free(bigBuffer); - } +int32_t CharsetRecog_IBM420_ar::match_sbcs(InputText *det, const int32_t ngrams[], const uint8_t byteMap[]) const +{ + NGramParser_IBM420 parser(ngrams, byteMap); + int32_t result; - return resultBuffer; -} - -void CharsetRecog_IBM420_ar::matchFinish(InputText *textIn) { - if (deleteBuffer) { - uprv_free(textIn->fInputBytes); + result = parser.parse(det); - textIn->fInputBytes = prev_fInputBytes; - textIn->fInputLen = prev_fInputBytesLength; - } -} - -UBool CharsetRecog_IBM420_ar::isLamAlef(uint8_t b) { - static const uint8_t shapedLamAlef[] = { - 0xb2, 0xb3, 0xb4, 0xb5, 0xb7, 0xb8 - }; - - for (uint32_t i = 0; i < sizeof(shapedLamAlef); i++) { - if (b == shapedLamAlef[i]) { - return TRUE; - } - } - - return FALSE; + return result; } CharsetRecog_IBM420_ar_rtl::~CharsetRecog_IBM420_ar_rtl() @@ -1245,6 +1268,7 @@ UBool CharsetRecog_IBM420_ar_ltr::match(InputText *textIn, CharsetMatch *results results->set(textIn, this, confidence); return (confidence > 0); } +#endif U_NAMESPACE_END #endif