icuSources/i18n/csrutf8.cpp

   1 /*
   2  **********************************************************************
   3  *   Copyright (C) 2005-2012, International Business Machines
   4  *   Corporation and others.  All Rights Reserved.
   5  **********************************************************************
   6  */
   7
   8 #include "unicode/utypes.h"
   9
  10 #if !UCONFIG_NO_CONVERSION
  11
  12 #include "csrutf8.h"
  13 #include "csmatch.h"
  14
  15 U_NAMESPACE_BEGIN
  16
  17 CharsetRecog_UTF8::~CharsetRecog_UTF8()
  18 {
  19     // nothing to do
  20 }
  21
  22 const char *CharsetRecog_UTF8::getName() const
  23 {
  24     return "UTF-8";
  25 }
  26
  27 UBool CharsetRecog_UTF8::match(InputText* input, CharsetMatch *results) const {
  28     bool hasBOM = FALSE;
  29     int32_t numValid = 0;
  30     int32_t numInvalid = 0;
  31     const uint8_t *inputBytes = input->fRawInput;
  32     int32_t i;
  33     int32_t trailBytes = 0;
  34     int32_t confidence;
  35
  36     if (input->fRawLength >= 3 &&
  37         inputBytes[0] == 0xEF && inputBytes[1] == 0xBB && inputBytes[2] == 0xBF) {
  38             hasBOM = TRUE;
  39     }
  40
  41     // Scan for multi-byte sequences
  42     for (i=0; i < input->fRawLength; i += 1) {
  43         int32_t b = inputBytes[i];
  44
  45         if ((b & 0x80) == 0) {
  46             continue;   // ASCII
  47         }
  48
  49         // Hi bit on char found.  Figure out how long the sequence should be
  50         if ((b & 0x0E0) == 0x0C0) {
  51             trailBytes = 1;
  52         } else if ((b & 0x0F0) == 0x0E0) {
  53             trailBytes = 2;
  54         } else if ((b & 0x0F8) == 0xF0) {
  55             trailBytes = 3;
  56         } else {
  57             numInvalid += 1;
  58
  59             if (numInvalid > 5) {
  60                 break;
  61             }
  62
  63             trailBytes = 0;
  64         }
  65
  66         // Verify that we've got the right number of trail bytes in the sequence
  67         for (;;) {
  68             i += 1;
  69
  70             if (i >= input->fRawLength) {
  71                 break;
  72             }
  73
  74             b = inputBytes[i];
  75
  76             if ((b & 0xC0) != 0x080) {
  77                 numInvalid += 1;
  78                 break;
  79             }
  80
  81             if (--trailBytes == 0) {
  82                 numValid += 1;
  83                 break;
  84             }
  85         }
  86
  87     }
  88
  89     // Cook up some sort of confidence score, based on presense of a BOM
  90     //    and the existence of valid and/or invalid multi-byte sequences.
  91     confidence = 0;
  92     if (hasBOM && numInvalid == 0) {
  93         confidence = 100;
  94     } else if (hasBOM && numValid > numInvalid*10) {
  95         confidence = 80;
  96     } else if (numValid > 3 && numInvalid == 0) {
  97         confidence = 100;
  98     } else if (numValid > 0 && numInvalid == 0) {
  99         confidence = 80;
 100     } else if (numValid == 0 && numInvalid == 0) {
 101         // Plain ASCII.
 102         confidence = 10;
 103     } else if (numValid > numInvalid*10) {
 104         // Probably corruput utf-8 data.  Valid sequences aren't likely by chance.
 105         confidence = 25;
 106     }
 107
 108     results->set(input, this, confidence);
 109     return (confidence > 0);
 110 }
 111
 112 U_NAMESPACE_END
 113 #endif