+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
/**
*******************************************************************************
- * Copyright (C) 2006,2011, International Business Machines Corporation *
+ * Copyright (C) 2006-2014, International Business Machines Corporation *
* and others. All Rights Reserved. *
*******************************************************************************
*/
#include "unicode/utext.h"
#include "brkeng.h"
+#include "uvectr32.h"
U_NAMESPACE_BEGIN
-class TrieWordDictionary;
+class DictionaryMatcher;
+class Normalizer2;
/*******************************************************************
* DictionaryBreakEngine
UnicodeSet fSet;
- /**
- * The set of break types handled by this engine
- * @internal
- */
-
- uint32_t fTypes;
+ public:
/**
- * <p>Default constructor.</p>
- *
+ * <p>Constructor </p>
*/
DictionaryBreakEngine();
- public:
+ /**
+ * <p>Virtual destructor.</p>
+ */
+ virtual ~DictionaryBreakEngine();
/**
- * <p>Constructor setting the break types handled.</p>
+ * <p>Indicate whether this engine handles a particular character for
+ * a particular kind of break.</p>
*
- * @param breakTypes A bitmap of types handled by the engine.
+ * @param c A character which begins a run that the engine might handle
+ * @return TRUE if this engine handles the particular character and break
+ * type.
*/
- DictionaryBreakEngine( uint32_t breakTypes );
+ virtual UBool handles(UChar32 c) const;
/**
- * <p>Virtual destructor.</p>
+ * <p>Find any breaks within a run in the supplied text.</p>
+ *
+ * @param text A UText representing the text. The iterator is left at
+ * the end of the run of characters which the engine is capable of handling
+ * that starts from the first character in the range.
+ * @param startPos The start of the run within the supplied text.
+ * @param endPos The end of the run within the supplied text.
+ * @param foundBreaks vector of int32_t to receive the break positions
+ * @return The number of breaks found.
*/
- virtual ~DictionaryBreakEngine();
-
- /**
- * <p>Indicate whether this engine handles a particular character for
- * a particular kind of break.</p>
- *
- * @param c A character which begins a run that the engine might handle
- * @param breakType The type of text break which the caller wants to determine
- * @return TRUE if this engine handles the particular character and break
- * type.
- */
- virtual UBool handles( UChar32 c, int32_t breakType ) const;
-
- /**
- * <p>Find any breaks within a run in the supplied text.</p>
- *
- * @param text A UText representing the text. The
- * iterator is left at the end of the run of characters which the engine
- * is capable of handling.
- * @param startPos The start of the run within the supplied text.
- * @param endPos The end of the run within the supplied text.
- * @param reverse Whether the caller is looking for breaks in a reverse
- * direction.
- * @param breakType The type of break desired, or -1.
- * @param foundBreaks An allocated C array of the breaks found, if any
- * @return The number of breaks found.
- */
virtual int32_t findBreaks( UText *text,
int32_t startPos,
int32_t endPos,
- UBool reverse,
- int32_t breakType,
- UStack &foundBreaks ) const;
+ UVector32 &foundBreaks ) const;
protected:
virtual void setCharacters( const UnicodeSet &set );
/**
- * <p>Set the break types handled by this engine.</p>
- *
- * @param breakTypes A bitmap of types handled by the engine.
- */
-// virtual void setBreakTypes( uint32_t breakTypes );
-
- /**
- * <p>Divide up a range of known dictionary characters.</p>
+ * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
*
* @param text A UText representing the text
* @param rangeStart The start of the range of dictionary characters
virtual int32_t divideUpDictionaryRange( UText *text,
int32_t rangeStart,
int32_t rangeEnd,
- UStack &foundBreaks ) const = 0;
+ UVector32 &foundBreaks ) const = 0;
};
/**
* <p>ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a
- * TrieWordDictionary and heuristics to determine Thai-specific breaks.</p>
+ * dictionary and heuristics to determine Thai-specific breaks.</p>
*
* <p>After it is constructed a ThaiBreakEngine may be shared between
* threads without synchronization.</p>
UnicodeSet fBeginWordSet;
UnicodeSet fSuffixSet;
UnicodeSet fMarkSet;
- const TrieWordDictionary *fDictionary;
+ DictionaryMatcher *fDictionary;
public:
/**
* <p>Default constructor.</p>
*
- * @param adoptDictionary A TrieWordDictionary to adopt. Deleted when the
+ * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
* engine is deleted.
*/
- ThaiBreakEngine(const TrieWordDictionary *adoptDictionary, UErrorCode &status);
+ ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
/**
* <p>Virtual destructor.</p>
protected:
/**
- * <p>Divide up a range of known dictionary characters.</p>
+ * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
*
* @param text A UText representing the text
* @param rangeStart The start of the range of dictionary characters
virtual int32_t divideUpDictionaryRange( UText *text,
int32_t rangeStart,
int32_t rangeEnd,
- UStack &foundBreaks ) const;
+ UVector32 &foundBreaks ) const;
};
+/*******************************************************************
+ * LaoBreakEngine
+ */
+
+/**
+ * <p>LaoBreakEngine is a kind of DictionaryBreakEngine that uses a
+ * dictionary and heuristics to determine Lao-specific breaks.</p>
+ *
+ * <p>After it is constructed a LaoBreakEngine may be shared between
+ * threads without synchronization.</p>
+ */
+class LaoBreakEngine : public DictionaryBreakEngine {
+ private:
+ /**
+ * The set of characters handled by this engine
+ * @internal
+ */
+
+ UnicodeSet fLaoWordSet;
+ UnicodeSet fEndWordSet;
+ UnicodeSet fBeginWordSet;
+ UnicodeSet fMarkSet;
+ DictionaryMatcher *fDictionary;
+
+ public:
+ /**
+ * <p>Default constructor.</p>
+ *
+ * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
+ * engine is deleted.
+ */
+ LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
+
+ /**
+ * <p>Virtual destructor.</p>
+ */
+ virtual ~LaoBreakEngine();
+
+ protected:
+ /**
+ * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
+ *
+ * @param text A UText representing the text
+ * @param rangeStart The start of the range of dictionary characters
+ * @param rangeEnd The end of the range of dictionary characters
+ * @param foundBreaks Output of C array of int32_t break positions, or 0
+ * @return The number of breaks found
+ */
+ virtual int32_t divideUpDictionaryRange( UText *text,
+ int32_t rangeStart,
+ int32_t rangeEnd,
+ UVector32 &foundBreaks ) const;
+
+};
+
+/*******************************************************************
+ * BurmeseBreakEngine
+ */
+
+/**
+ * <p>BurmeseBreakEngine is a kind of DictionaryBreakEngine that uses a
+ * DictionaryMatcher and heuristics to determine Burmese-specific breaks.</p>
+ *
+ * <p>After it is constructed a BurmeseBreakEngine may be shared between
+ * threads without synchronization.</p>
+ */
+class BurmeseBreakEngine : public DictionaryBreakEngine {
+ private:
+ /**
+ * The set of characters handled by this engine
+ * @internal
+ */
+
+ UnicodeSet fBurmeseWordSet;
+ UnicodeSet fEndWordSet;
+ UnicodeSet fBeginWordSet;
+ UnicodeSet fMarkSet;
+ DictionaryMatcher *fDictionary;
+
+ public:
+
+ /**
+ * <p>Default constructor.</p>
+ *
+ * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
+ * engine is deleted.
+ */
+ BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
+
+ /**
+ * <p>Virtual destructor.</p>
+ */
+ virtual ~BurmeseBreakEngine();
+
+ protected:
+ /**
+ * <p>Divide up a range of known dictionary characters.</p>
+ *
+ * @param text A UText representing the text
+ * @param rangeStart The start of the range of dictionary characters
+ * @param rangeEnd The end of the range of dictionary characters
+ * @param foundBreaks Output of C array of int32_t break positions, or 0
+ * @return The number of breaks found
+ */
+ virtual int32_t divideUpDictionaryRange( UText *text,
+ int32_t rangeStart,
+ int32_t rangeEnd,
+ UVector32 &foundBreaks ) const;
+
+};
+
/*******************************************************************
* KhmerBreakEngine
*/
/**
* <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a
- * TrieWordDictionary and heuristics to determine Khmer-specific breaks.</p>
+ * DictionaryMatcher and heuristics to determine Khmer-specific breaks.</p>
*
* <p>After it is constructed a KhmerBreakEngine may be shared between
* threads without synchronization.</p>
UnicodeSet fEndWordSet;
UnicodeSet fBeginWordSet;
UnicodeSet fMarkSet;
- const TrieWordDictionary *fDictionary;
+ DictionaryMatcher *fDictionary;
public:
/**
* <p>Default constructor.</p>
*
- * @param adoptDictionary A TrieWordDictionary to adopt. Deleted when the
+ * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
* engine is deleted.
*/
- KhmerBreakEngine(const TrieWordDictionary *adoptDictionary, UErrorCode &status);
+ KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
/**
* <p>Virtual destructor.</p>
virtual int32_t divideUpDictionaryRange( UText *text,
int32_t rangeStart,
int32_t rangeEnd,
- UStack &foundBreaks ) const;
+ UVector32 &foundBreaks ) const;
};
-
+#if !UCONFIG_NO_NORMALIZATION
+
+/*******************************************************************
+ * CjkBreakEngine
+ */
+
+//indicates language/script that the CjkBreakEngine will handle
+enum LanguageType {
+ kKorean,
+ kChineseJapanese
+};
+
+/**
+ * <p>CjkBreakEngine is a kind of DictionaryBreakEngine that uses a
+ * dictionary with costs associated with each word and
+ * Viterbi decoding to determine CJK-specific breaks.</p>
+ */
+class CjkBreakEngine : public DictionaryBreakEngine {
+ protected:
+ /**
+ * The set of characters handled by this engine
+ * @internal
+ */
+ UnicodeSet fHangulWordSet;
+ UnicodeSet fHanWordSet;
+ UnicodeSet fKatakanaWordSet;
+ UnicodeSet fHiraganaWordSet;
+
+ DictionaryMatcher *fDictionary;
+ const Normalizer2 *nfkcNorm2;
+
+ public:
+
+ /**
+ * <p>Default constructor.</p>
+ *
+ * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
+ * engine is deleted. The DictionaryMatcher must contain costs for each word
+ * in order for the dictionary to work properly.
+ */
+ CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status);
+
+ /**
+ * <p>Virtual destructor.</p>
+ */
+ virtual ~CjkBreakEngine();
+
+ protected:
+ /**
+ * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
+ *
+ * @param text A UText representing the text
+ * @param rangeStart The start of the range of dictionary characters
+ * @param rangeEnd The end of the range of dictionary characters
+ * @param foundBreaks Output of C array of int32_t break positions, or 0
+ * @return The number of breaks found
+ */
+ virtual int32_t divideUpDictionaryRange( UText *text,
+ int32_t rangeStart,
+ int32_t rangeEnd,
+ UVector32 &foundBreaks ) const;
+
+};
+
+#endif
+
U_NAMESPACE_END
/* DICTBE_H */