X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/4388f060552cc537e71e957d32f35e9d75a61233..a01113dcd0f39d5da295ef82785beff9ed86fe38:/icuSources/common/dictbe.h diff --git a/icuSources/common/dictbe.h b/icuSources/common/dictbe.h index 81864ebf..731bfdff 100644 --- a/icuSources/common/dictbe.h +++ b/icuSources/common/dictbe.h @@ -1,6 +1,8 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html /** ******************************************************************************* - * Copyright (C) 2006,2011, International Business Machines Corporation * + * Copyright (C) 2006-2014, International Business Machines Corporation * * and others. All Rights Reserved. * ******************************************************************************* */ @@ -13,10 +15,12 @@ #include "unicode/utext.h" #include "brkeng.h" +#include "uvectr32.h" U_NAMESPACE_BEGIN -class TrieWordDictionary; +class DictionaryMatcher; +class Normalizer2; /******************************************************************* * DictionaryBreakEngine @@ -38,64 +42,43 @@ class DictionaryBreakEngine : public LanguageBreakEngine { UnicodeSet fSet; - /** - * The set of break types handled by this engine - * @internal - */ - - uint32_t fTypes; + public: /** - *

Default constructor.

- * + *

Constructor

*/ DictionaryBreakEngine(); - public: + /** + *

Virtual destructor.

+ */ + virtual ~DictionaryBreakEngine(); /** - *

Constructor setting the break types handled.

+ *

Indicate whether this engine handles a particular character for + * a particular kind of break.

* - * @param breakTypes A bitmap of types handled by the engine. + * @param c A character which begins a run that the engine might handle + * @return TRUE if this engine handles the particular character and break + * type. */ - DictionaryBreakEngine( uint32_t breakTypes ); + virtual UBool handles(UChar32 c) const; /** - *

Virtual destructor.

+ *

Find any breaks within a run in the supplied text.

+ * + * @param text A UText representing the text. The iterator is left at + * the end of the run of characters which the engine is capable of handling + * that starts from the first character in the range. + * @param startPos The start of the run within the supplied text. + * @param endPos The end of the run within the supplied text. + * @param foundBreaks vector of int32_t to receive the break positions + * @return The number of breaks found. */ - virtual ~DictionaryBreakEngine(); - - /** - *

Indicate whether this engine handles a particular character for - * a particular kind of break.

- * - * @param c A character which begins a run that the engine might handle - * @param breakType The type of text break which the caller wants to determine - * @return TRUE if this engine handles the particular character and break - * type. - */ - virtual UBool handles( UChar32 c, int32_t breakType ) const; - - /** - *

Find any breaks within a run in the supplied text.

- * - * @param text A UText representing the text. The - * iterator is left at the end of the run of characters which the engine - * is capable of handling. - * @param startPos The start of the run within the supplied text. - * @param endPos The end of the run within the supplied text. - * @param reverse Whether the caller is looking for breaks in a reverse - * direction. - * @param breakType The type of break desired, or -1. - * @param foundBreaks An allocated C array of the breaks found, if any - * @return The number of breaks found. - */ virtual int32_t findBreaks( UText *text, int32_t startPos, int32_t endPos, - UBool reverse, - int32_t breakType, - UStack &foundBreaks ) const; + UVector32 &foundBreaks ) const; protected: @@ -107,14 +90,7 @@ class DictionaryBreakEngine : public LanguageBreakEngine { virtual void setCharacters( const UnicodeSet &set ); /** - *

Set the break types handled by this engine.

- * - * @param breakTypes A bitmap of types handled by the engine. - */ -// virtual void setBreakTypes( uint32_t breakTypes ); - - /** - *

Divide up a range of known dictionary characters.

+ *

Divide up a range of known dictionary characters handled by this break engine.

* * @param text A UText representing the text * @param rangeStart The start of the range of dictionary characters @@ -125,7 +101,7 @@ class DictionaryBreakEngine : public LanguageBreakEngine { virtual int32_t divideUpDictionaryRange( UText *text, int32_t rangeStart, int32_t rangeEnd, - UStack &foundBreaks ) const = 0; + UVector32 &foundBreaks ) const = 0; }; @@ -135,7 +111,7 @@ class DictionaryBreakEngine : public LanguageBreakEngine { /** *

ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a - * TrieWordDictionary and heuristics to determine Thai-specific breaks.

+ * dictionary and heuristics to determine Thai-specific breaks.

* *

After it is constructed a ThaiBreakEngine may be shared between * threads without synchronization.

@@ -152,17 +128,17 @@ class ThaiBreakEngine : public DictionaryBreakEngine { UnicodeSet fBeginWordSet; UnicodeSet fSuffixSet; UnicodeSet fMarkSet; - const TrieWordDictionary *fDictionary; + DictionaryMatcher *fDictionary; public: /** *

Default constructor.

* - * @param adoptDictionary A TrieWordDictionary to adopt. Deleted when the + * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the * engine is deleted. */ - ThaiBreakEngine(const TrieWordDictionary *adoptDictionary, UErrorCode &status); + ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); /** *

Virtual destructor.

@@ -171,7 +147,7 @@ class ThaiBreakEngine : public DictionaryBreakEngine { protected: /** - *

Divide up a range of known dictionary characters.

+ *

Divide up a range of known dictionary characters handled by this break engine.

* * @param text A UText representing the text * @param rangeStart The start of the range of dictionary characters @@ -182,18 +158,129 @@ class ThaiBreakEngine : public DictionaryBreakEngine { virtual int32_t divideUpDictionaryRange( UText *text, int32_t rangeStart, int32_t rangeEnd, - UStack &foundBreaks ) const; + UVector32 &foundBreaks ) const; }; +/******************************************************************* + * LaoBreakEngine + */ + +/** + *

LaoBreakEngine is a kind of DictionaryBreakEngine that uses a + * dictionary and heuristics to determine Lao-specific breaks.

+ * + *

After it is constructed a LaoBreakEngine may be shared between + * threads without synchronization.

+ */ +class LaoBreakEngine : public DictionaryBreakEngine { + private: + /** + * The set of characters handled by this engine + * @internal + */ + + UnicodeSet fLaoWordSet; + UnicodeSet fEndWordSet; + UnicodeSet fBeginWordSet; + UnicodeSet fMarkSet; + DictionaryMatcher *fDictionary; + + public: + /** + *

Default constructor.

+ * + * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the + * engine is deleted. + */ + LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); + + /** + *

Virtual destructor.

+ */ + virtual ~LaoBreakEngine(); + + protected: + /** + *

Divide up a range of known dictionary characters handled by this break engine.

+ * + * @param text A UText representing the text + * @param rangeStart The start of the range of dictionary characters + * @param rangeEnd The end of the range of dictionary characters + * @param foundBreaks Output of C array of int32_t break positions, or 0 + * @return The number of breaks found + */ + virtual int32_t divideUpDictionaryRange( UText *text, + int32_t rangeStart, + int32_t rangeEnd, + UVector32 &foundBreaks ) const; + +}; + +/******************************************************************* + * BurmeseBreakEngine + */ + +/** + *

BurmeseBreakEngine is a kind of DictionaryBreakEngine that uses a + * DictionaryMatcher and heuristics to determine Burmese-specific breaks.

+ * + *

After it is constructed a BurmeseBreakEngine may be shared between + * threads without synchronization.

+ */ +class BurmeseBreakEngine : public DictionaryBreakEngine { + private: + /** + * The set of characters handled by this engine + * @internal + */ + + UnicodeSet fBurmeseWordSet; + UnicodeSet fEndWordSet; + UnicodeSet fBeginWordSet; + UnicodeSet fMarkSet; + DictionaryMatcher *fDictionary; + + public: + + /** + *

Default constructor.

+ * + * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the + * engine is deleted. + */ + BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); + + /** + *

Virtual destructor.

+ */ + virtual ~BurmeseBreakEngine(); + + protected: + /** + *

Divide up a range of known dictionary characters.

+ * + * @param text A UText representing the text + * @param rangeStart The start of the range of dictionary characters + * @param rangeEnd The end of the range of dictionary characters + * @param foundBreaks Output of C array of int32_t break positions, or 0 + * @return The number of breaks found + */ + virtual int32_t divideUpDictionaryRange( UText *text, + int32_t rangeStart, + int32_t rangeEnd, + UVector32 &foundBreaks ) const; + +}; + /******************************************************************* * KhmerBreakEngine */ /** *

KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a - * TrieWordDictionary and heuristics to determine Khmer-specific breaks.

+ * DictionaryMatcher and heuristics to determine Khmer-specific breaks.

* *

After it is constructed a KhmerBreakEngine may be shared between * threads without synchronization.

@@ -209,17 +296,17 @@ class KhmerBreakEngine : public DictionaryBreakEngine { UnicodeSet fEndWordSet; UnicodeSet fBeginWordSet; UnicodeSet fMarkSet; - const TrieWordDictionary *fDictionary; + DictionaryMatcher *fDictionary; public: /** *

Default constructor.

* - * @param adoptDictionary A TrieWordDictionary to adopt. Deleted when the + * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the * engine is deleted. */ - KhmerBreakEngine(const TrieWordDictionary *adoptDictionary, UErrorCode &status); + KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); /** *

Virtual destructor.

@@ -239,11 +326,76 @@ class KhmerBreakEngine : public DictionaryBreakEngine { virtual int32_t divideUpDictionaryRange( UText *text, int32_t rangeStart, int32_t rangeEnd, - UStack &foundBreaks ) const; + UVector32 &foundBreaks ) const; }; - +#if !UCONFIG_NO_NORMALIZATION + +/******************************************************************* + * CjkBreakEngine + */ + +//indicates language/script that the CjkBreakEngine will handle +enum LanguageType { + kKorean, + kChineseJapanese +}; + +/** + *

CjkBreakEngine is a kind of DictionaryBreakEngine that uses a + * dictionary with costs associated with each word and + * Viterbi decoding to determine CJK-specific breaks.

+ */ +class CjkBreakEngine : public DictionaryBreakEngine { + protected: + /** + * The set of characters handled by this engine + * @internal + */ + UnicodeSet fHangulWordSet; + UnicodeSet fHanWordSet; + UnicodeSet fKatakanaWordSet; + UnicodeSet fHiraganaWordSet; + + DictionaryMatcher *fDictionary; + const Normalizer2 *nfkcNorm2; + + public: + + /** + *

Default constructor.

+ * + * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the + * engine is deleted. The DictionaryMatcher must contain costs for each word + * in order for the dictionary to work properly. + */ + CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status); + + /** + *

Virtual destructor.

+ */ + virtual ~CjkBreakEngine(); + + protected: + /** + *

Divide up a range of known dictionary characters handled by this break engine.

+ * + * @param text A UText representing the text + * @param rangeStart The start of the range of dictionary characters + * @param rangeEnd The end of the range of dictionary characters + * @param foundBreaks Output of C array of int32_t break positions, or 0 + * @return The number of breaks found + */ + virtual int32_t divideUpDictionaryRange( UText *text, + int32_t rangeStart, + int32_t rangeEnd, + UVector32 &foundBreaks ) const; + +}; + +#endif + U_NAMESPACE_END /* DICTBE_H */