icuSources/i18n/csrmbcs.h

   1 /*
   2  **********************************************************************
   3  *   Copyright (C) 2005-2012, International Business Machines
   4  *   Corporation and others.  All Rights Reserved.
   5  **********************************************************************
   6  */
   7
   8 #ifndef __CSRMBCS_H
   9 #define __CSRMBCS_H
  10
  11 #include "unicode/utypes.h"
  12
  13 #if !UCONFIG_NO_CONVERSION
  14
  15 #include "csrecog.h"
  16
  17 U_NAMESPACE_BEGIN
  18
  19 // IteratedChar: holder for one "character" produced while iterating over the input.
  20 //    Recognizers for specific mbcs encodings make their "characters" available
  21 //    by providing a nextChar() function that fills in an instance of IteratedChar
  22 //    with the next char from the input.
  23 //    The returned characters are not converted to Unicode, but remain as the raw
  24 //    bytes (concatenated into an int) from the codepage data.
  25 //
  26 //  For Asian charsets, use the raw input rather than the input that has been
  27 //   stripped of markup.  Detection only considers multi-byte chars, effectively
  28 //   stripping markup anyway, and double byte chars do occur in markup too.
  29 //  All data members are public; recognizers fill this object in like a plain struct.
  30 class IteratedChar : public UMemory
  31 {
  32 public:
  33     uint32_t charValue;             // 1-4 bytes from the raw input data
  34     int32_t  index;                 // NOTE(review): presumably the input offset where this char starts - confirm in csrmbcs.cpp
  35     int32_t  nextIndex;             // NOTE(review): presumably the input offset of the next unread byte - confirm in csrmbcs.cpp
  36     UBool    error;                 // NOTE(review): presumably set when an invalid byte sequence was seen - confirm
  37     UBool    done;                  // NOTE(review): presumably set once the end of the input is reached - confirm
  38
  39 public:
  40     IteratedChar();                 // Default constructor; initial field values are defined in the implementation file, not here.
  41     //void reset();                 // (declaration disabled - kept commented out)
  42     int32_t nextByte(InputText* det); // Fetches one byte from det; NOTE(review): end-of-input return value is not visible here - confirm.
  43 };
  44
  45 #if U_PLATFORM_IS_DARWIN_BASED  // Only defined when U_PLATFORM_IS_DARWIN_BASED is non-zero.
  46 #define MAX_KEY_STRING_WITH_NULL 16  // Row width of the keyStrings table accepted by match_mbcs(); NOTE(review): name suggests the 16 bytes include a terminating NUL - confirm.
  47 #endif
  48
  49 class CharsetRecog_mbcs : public CharsetRecognizer {
  50     // Abstract base for the multi-byte (mbcs) charset recognizers declared in this header.
  51 protected:
  52     /**
  53      * Test the match of this charset with the input text data.
  54      *
  55      * @param det  The InputText holding the input data to be checked for being in this charset.
  56      * @param commonChars  Array of character values; NOTE(review): presumably chars frequent in this charset - confirm in csrmbcs.cpp.
  57      * @param commonCharsLen  NOTE(review): presumably the number of entries in commonChars - confirm.
  58      * @param keyStrings  (Darwin-based builds only) Pointer to rows of MAX_KEY_STRING_WITH_NULL bytes each.
  59      * @return     Two values packed into one int:
  60      *             bits 0-7:  the match confidence, ranging from 0-100
  61      *             <br/>
  62      *             bits 8-15: The match reason, an enum-like value.
  63      */
  64 #if U_PLATFORM_IS_DARWIN_BASED  // Darwin-based builds take the extra keyStrings argument.
  65     int32_t match_mbcs(InputText* det, const uint16_t commonChars[], int32_t commonCharsLen, const uint8_t (*keyStrings)[MAX_KEY_STRING_WITH_NULL] ) const;
  66 #else
  67     int32_t match_mbcs(InputText* det, const uint16_t commonChars[], int32_t commonCharsLen) const;
  68 #endif
  69
  70 public:
  71
  72     virtual ~CharsetRecog_mbcs();
  73
  74     /**
  75      * Get the IANA name of this charset.
  76      * @return the charset name.
  77      */
  78
  79     const char *getName() const = 0;
  80     const char *getLanguage() const = 0;  // Pure virtual here; NOTE(review): presumably returns a language identifier for the charset - confirm.
  81     UBool match(InputText* input, CharsetMatch *results) const = 0;  // Pure virtual here; each concrete recognizer supplies its own match().
  82
  83     /**
  84      * Get the next character (however many bytes it is) from the input data
  85      *    Subclasses for specific charset encodings must implement this function
  86      *    to get characters according to the rules of their encoding scheme.
  87      *
  88      *  This function is not a method of class IteratedChar only because
  89      *   that would require a lot of extra derived classes, which is awkward.
  90      * @param it  The IteratedChar "struct" into which the returned char is placed.
  91      * @param textIn The input text, which is needed to get at the input byte data
  92      *            being iterated over.
  93      * @return    True if a character was returned, false at end of input.
  94      */
  95     virtual UBool nextChar(IteratedChar *it, InputText *textIn) const = 0;
  96
  97 };
  98
  99
 100 /**
 101  *   Shift-JIS charset recognizer.
 102  *   Concrete class: declares every function that CharsetRecog_mbcs leaves pure virtual.
 103  */
 104 class CharsetRecog_sjis : public CharsetRecog_mbcs {
 105 public:
 106     virtual ~CharsetRecog_sjis();
 107
 108     UBool nextChar(IteratedChar *it, InputText *det) const;  // Shift-JIS version of the character iterator declared in CharsetRecog_mbcs.
 109
 110     UBool match(InputText* input, CharsetMatch *results) const;  // Shift-JIS version of match(); defined outside this header.
 111
 112     const char *getName() const;      // Charset name for this recognizer.
 113     const char *getLanguage() const;  // NOTE(review): presumably the language associated with this charset - confirm.
 114
 115 };
 116
 117
 118 /**
 119  *   EUC charset recognizers.  One abstract class that provides the common function
 120  *             for getting the next character according to the EUC encoding scheme,
 121  *             and separate derived classes for EUC-JP and EUC-KR.
 122  *   getName(), getLanguage() and match() remain pure virtual in this class.
 123  */
 124 class CharsetRecog_euc : public CharsetRecog_mbcs
 125 {
 126 public:
 127     virtual ~CharsetRecog_euc();
 128
 129     const char *getName() const = 0;
 130     const char *getLanguage() const = 0;
 131
 132     UBool match(InputText* input, CharsetMatch *results) const = 0;
 133     /*
 134      *  (non-Javadoc)
 135      *  Get the next character value for EUC based encodings.
 136      *  Character "value" is simply the raw bytes that make up the character
 137      *     packed into an int.
 138      */
 139     UBool nextChar(IteratedChar *it, InputText *det) const;  // Shared by the EUC recognizers derived from this class.
 140 };
 141
 142 /**
 143  * The charset recognizer for EUC-JP.  A singleton instance of this class
 144  *    is created and kept by the public CharsetDetector class
 145  */
 146 class CharsetRecog_euc_jp : public CharsetRecog_euc
 147 {
 148 public:
 149     virtual ~CharsetRecog_euc_jp();
 150
 151     const char *getName() const;      // Charset name for this recognizer.
 152     const char *getLanguage() const;  // NOTE(review): presumably the language associated with this charset - confirm.
 153
 154     UBool match(InputText* input, CharsetMatch *results) const;  // EUC-JP version of match(); nextChar() is not redeclared here.
 155 };
 156
 157 /**
 158  * The charset recognizer for EUC-KR.  A singleton instance of this class
 159  *    is created and kept by the public CharsetDetector class
 160  */
 161 class CharsetRecog_euc_kr : public CharsetRecog_euc
 162 {
 163 public:
 164     virtual ~CharsetRecog_euc_kr();
 165
 166     const char *getName() const;      // Charset name for this recognizer.
 167     const char *getLanguage() const;  // NOTE(review): presumably the language associated with this charset - confirm.
 168
 169     UBool match(InputText* input, CharsetMatch *results) const;  // EUC-KR version of match(); nextChar() is not redeclared here.
 170 };
 171
 172 /**
 173  *
 174  *   Big5 charset recognizer.
 175  *   Concrete class: declares every function that CharsetRecog_mbcs leaves pure virtual.
 176  */
 177 class CharsetRecog_big5 : public CharsetRecog_mbcs
 178 {
 179 public:
 180     virtual ~CharsetRecog_big5();
 181
 182     UBool nextChar(IteratedChar* it, InputText* det) const;  // Big5 version of the character iterator declared in CharsetRecog_mbcs.
 183
 184     const char *getName() const;      // Charset name for this recognizer.
 185     const char *getLanguage() const;  // NOTE(review): presumably the language associated with this charset - confirm.
 186
 187     UBool match(InputText* input, CharsetMatch *results) const;  // Big5 version of match(); defined outside this header.
 188 };
 189
 190
 191 /**
 192  *
 193  *   GB-18030 recognizer. Uses simplified Chinese statistics.
 194  *   Concrete class: declares every function that CharsetRecog_mbcs leaves pure virtual.
 195  */
 196 class CharsetRecog_gb_18030 : public CharsetRecog_mbcs
 197 {
 198 public:
 199     virtual ~CharsetRecog_gb_18030();
 200
 201     UBool nextChar(IteratedChar* it, InputText* det) const;  // GB-18030 version of the character iterator declared in CharsetRecog_mbcs.
 202
 203     const char *getName() const;      // Charset name for this recognizer.
 204     const char *getLanguage() const;  // NOTE(review): presumably the language associated with this charset - confirm.
 205
 206     UBool match(InputText* input, CharsetMatch *results) const;  // GB-18030 version of match(); defined outside this header.
 207 };
 208
 209 U_NAMESPACE_END
 210
 211 #endif
 212 #endif /* __CSRMBCS_H */