+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
- * Copyright (C) 2005-2006, International Business Machines
+ * Copyright (C) 2005-2015, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*/
class NGramParser : public UMemory
{
private:
- int32_t byteIndex;
int32_t ngram;
-
- const int32_t *ngramList;
- const uint8_t *charMap;
+ const int32_t *ngramList;
int32_t ngramCount;
int32_t hitCount;
+protected:
+ int32_t byteIndex;
+ const uint8_t *charMap;
+
+ void addByte(int32_t b);
+
public:
NGramParser(const int32_t *theNgramList, const uint8_t *theCharMap);
+ virtual ~NGramParser();
private:
/*
int32_t search(const int32_t *table, int32_t value);
void lookup(int32_t thisNgram);
- void addByte(int32_t b);
- int32_t nextByte(InputText *det);
+
+ virtual int32_t nextByte(InputText *det);
+ virtual void parseCharacters(InputText *det);
public:
int32_t parse(InputText *det);
};
-class CharsetRecog_sbcs : public CharsetRecognizer
+#if !UCONFIG_ONLY_HTML_CONVERSION
+class NGramParser_IBM420 : public NGramParser
{
-protected:
- UBool haveC1Bytes;
+public:
+ NGramParser_IBM420(const int32_t *theNgramList, const uint8_t *theCharMap);
+ ~NGramParser_IBM420();
+private:
+ int32_t alef;
+ int32_t isLamAlef(int32_t b);
+ int32_t nextByte(InputText *det);
+ void parseCharacters(InputText *det);
+};
+#endif
+
+
+class CharsetRecog_sbcs : public CharsetRecognizer
+{
public:
CharsetRecog_sbcs();
-
virtual ~CharsetRecog_sbcs();
-
virtual const char *getName() const = 0;
-
- virtual int32_t match(InputText *det) = 0;
-
- int32_t match_sbcs(InputText *det, const int32_t ngrams[], const uint8_t charMap[]);
+ virtual UBool match(InputText *det, CharsetMatch *results) const = 0;
+ virtual int32_t match_sbcs(InputText *det, const int32_t ngrams[], const uint8_t charMap[]) const;
};
class CharsetRecog_8859_1 : public CharsetRecog_sbcs
{
public:
virtual ~CharsetRecog_8859_1();
-
const char *getName() const;
+ virtual UBool match(InputText *det, CharsetMatch *results) const;
};
class CharsetRecog_8859_2 : public CharsetRecog_sbcs
{
public:
virtual ~CharsetRecog_8859_2();
-
const char *getName() const;
+ virtual UBool match(InputText *det, CharsetMatch *results) const;
};
class CharsetRecog_8859_5 : public CharsetRecog_sbcs
{
public:
virtual ~CharsetRecog_8859_5();
-
const char *getName() const;
};
const char *getName() const;
};
-class CharsetRecog_8859_1_en : public CharsetRecog_8859_1
-{
-public:
- virtual ~CharsetRecog_8859_1_en();
-
- const char *getLanguage() const;
-
- int32_t match(InputText *textIn);
-};
-
-class CharsetRecog_8859_1_da : public CharsetRecog_8859_1
-{
-public:
- virtual ~CharsetRecog_8859_1_da();
-
- const char *getLanguage() const;
- int32_t match(InputText *textIn);
-};
-class CharsetRecog_8859_1_de : public CharsetRecog_8859_1
-{
-public:
- virtual ~CharsetRecog_8859_1_de();
-
- const char *getLanguage() const;
-
- int32_t match(InputText *textIn);
-};
-
-class CharsetRecog_8859_1_es : public CharsetRecog_8859_1
+class CharsetRecog_8859_5_ru : public CharsetRecog_8859_5
{
public:
- virtual ~CharsetRecog_8859_1_es();
+ virtual ~CharsetRecog_8859_5_ru();
const char *getLanguage() const;
- int32_t match(InputText *textIn);
+ virtual UBool match(InputText *det, CharsetMatch *results) const;
};
-class CharsetRecog_8859_1_fr : public CharsetRecog_8859_1
+class CharsetRecog_8859_6_ar : public CharsetRecog_8859_6
{
public:
- virtual ~CharsetRecog_8859_1_fr();
+ virtual ~CharsetRecog_8859_6_ar();
const char *getLanguage() const;
- int32_t match(InputText *textIn);
+ virtual UBool match(InputText *det, CharsetMatch *results) const;
};
-class CharsetRecog_8859_1_it : public CharsetRecog_8859_1
+class CharsetRecog_8859_7_el : public CharsetRecog_8859_7
{
public:
- virtual ~CharsetRecog_8859_1_it();
+ virtual ~CharsetRecog_8859_7_el();
const char *getLanguage() const;
- int32_t match(InputText *textIn);
+ virtual UBool match(InputText *det, CharsetMatch *results) const;
};
-class CharsetRecog_8859_1_nl : public CharsetRecog_8859_1
+class CharsetRecog_8859_8_I_he : public CharsetRecog_8859_8
{
public:
- virtual ~CharsetRecog_8859_1_nl();
+ virtual ~CharsetRecog_8859_8_I_he();
+
+ const char *getName() const;
const char *getLanguage() const;
- int32_t match(InputText *textIn);
+ virtual UBool match(InputText *det, CharsetMatch *results) const;
};
-class CharsetRecog_8859_1_no : public CharsetRecog_8859_1
+class CharsetRecog_8859_8_he : public CharsetRecog_8859_8
{
public:
- virtual ~CharsetRecog_8859_1_no();
+ virtual ~CharsetRecog_8859_8_he ();
const char *getLanguage() const;
- int32_t match(InputText *textIn);
+ virtual UBool match(InputText *det, CharsetMatch *results) const;
};
-class CharsetRecog_8859_1_pt : public CharsetRecog_8859_1
+class CharsetRecog_8859_9_tr : public CharsetRecog_8859_9
{
public:
- virtual ~CharsetRecog_8859_1_pt();
+ virtual ~CharsetRecog_8859_9_tr ();
const char *getLanguage() const;
- int32_t match(InputText *textIn);
+ virtual UBool match(InputText *det, CharsetMatch *results) const;
};
-class CharsetRecog_8859_1_sv : public CharsetRecog_8859_1
+class CharsetRecog_windows_1256 : public CharsetRecog_sbcs
{
public:
- virtual ~CharsetRecog_8859_1_sv();
-
- const char *getLanguage() const;
-
- int32_t match(InputText *textIn);
-};
+ virtual ~CharsetRecog_windows_1256();
-class CharsetRecog_8859_2_cs : public CharsetRecog_8859_2
-{
-public:
- virtual ~CharsetRecog_8859_2_cs();
+ const char *getName() const;
const char *getLanguage() const;
- int32_t match(InputText *textIn);
+ virtual UBool match(InputText *det, CharsetMatch *results) const;
};
-class CharsetRecog_8859_2_hu : public CharsetRecog_8859_2
+class CharsetRecog_windows_1251 : public CharsetRecog_sbcs
{
public:
- virtual ~CharsetRecog_8859_2_hu();
-
- const char *getLanguage() const;
-
- int32_t match(InputText *textIn);
-};
+ virtual ~CharsetRecog_windows_1251();
-class CharsetRecog_8859_2_pl : public CharsetRecog_8859_2
-{
-public:
- virtual ~CharsetRecog_8859_2_pl();
+ const char *getName() const;
const char *getLanguage() const;
- int32_t match(InputText *textIn);
+ virtual UBool match(InputText *det, CharsetMatch *results) const;
};
-class CharsetRecog_8859_2_ro : public CharsetRecog_8859_2
-{
-public:
- virtual ~CharsetRecog_8859_2_ro();
-
- const char *getLanguage() const;
-
- int32_t match(InputText *textIn);
-};
-class CharsetRecog_8859_5_ru : public CharsetRecog_8859_5
+class CharsetRecog_KOI8_R : public CharsetRecog_sbcs
{
public:
- virtual ~CharsetRecog_8859_5_ru();
-
- const char *getLanguage() const;
-
- int32_t match(InputText *textIn);
-};
+ virtual ~CharsetRecog_KOI8_R();
-class CharsetRecog_8859_6_ar : public CharsetRecog_8859_6
-{
-public:
- virtual ~CharsetRecog_8859_6_ar();
+ const char *getName() const;
const char *getLanguage() const;
- int32_t match(InputText *textIn);
+ virtual UBool match(InputText *det, CharsetMatch *results) const;
};
-class CharsetRecog_8859_7_el : public CharsetRecog_8859_7
+#if !UCONFIG_ONLY_HTML_CONVERSION
+class CharsetRecog_IBM424_he : public CharsetRecog_sbcs
{
public:
- virtual ~CharsetRecog_8859_7_el();
+ virtual ~CharsetRecog_IBM424_he();
const char *getLanguage() const;
-
- int32_t match(InputText *textIn);
};
-class CharsetRecog_8859_8_I_he : public CharsetRecog_8859_8
-{
+class CharsetRecog_IBM424_he_rtl : public CharsetRecog_IBM424_he {
public:
- virtual ~CharsetRecog_8859_8_I_he();
-
+ virtual ~CharsetRecog_IBM424_he_rtl();
+
const char *getName() const;
-
- const char *getLanguage() const;
-
- int32_t match(InputText *textIn);
-};
-
-class CharsetRecog_8859_8_he : public CharsetRecog_8859_8
-{
-public:
- virtual ~CharsetRecog_8859_8_he ();
-
- const char *getLanguage() const;
-
- int32_t match(InputText *textIn);
+
+ virtual UBool match(InputText *det, CharsetMatch *results) const;
};
-class CharsetRecog_8859_9_tr : public CharsetRecog_8859_9
-{
-public:
- virtual ~CharsetRecog_8859_9_tr ();
-
- const char *getLanguage() const;
-
- int32_t match(InputText *textIn);
+class CharsetRecog_IBM424_he_ltr : public CharsetRecog_IBM424_he {
+ virtual ~CharsetRecog_IBM424_he_ltr();
+
+ const char *getName() const;
+
+ virtual UBool match(InputText *det, CharsetMatch *results) const;
};
-class CharsetRecog_windows_1256 : public CharsetRecog_sbcs
+class CharsetRecog_IBM420_ar : public CharsetRecog_sbcs
{
public:
- virtual ~CharsetRecog_windows_1256();
-
- const char *getName() const;
+ virtual ~CharsetRecog_IBM420_ar();
const char *getLanguage() const;
-
- int32_t match(InputText *textIn);
+ int32_t match_sbcs(InputText *det, const int32_t ngrams[], const uint8_t charMap[]) const;
+
};
-class CharsetRecog_windows_1251 : public CharsetRecog_sbcs
-{
+class CharsetRecog_IBM420_ar_rtl : public CharsetRecog_IBM420_ar {
public:
- virtual ~CharsetRecog_windows_1251();
-
+ virtual ~CharsetRecog_IBM420_ar_rtl();
+
const char *getName() const;
-
- const char *getLanguage() const;
-
- int32_t match(InputText *textIn);
+
+ virtual UBool match(InputText *det, CharsetMatch *results) const;
};
-
-class CharsetRecog_KOI8_R : public CharsetRecog_sbcs
-{
-public:
- virtual ~CharsetRecog_KOI8_R();
-
+class CharsetRecog_IBM420_ar_ltr : public CharsetRecog_IBM420_ar {
+ virtual ~CharsetRecog_IBM420_ar_ltr();
+
const char *getName() const;
-
- const char *getLanguage() const;
-
- int32_t match(InputText *textIn);
+
+ virtual UBool match(InputText *det, CharsetMatch *results) const;
};
+#endif
U_NAMESPACE_END
-#endif
+#endif /* !UCONFIG_NO_CONVERSION */
#endif /* __CSRSBCS_H */