git.saurik.com Git - apple/icu.git/blob - icuSources/common/dictionarydata.h

2 // License & terms of use: http://www.unicode.org/copyright.html

3 /*

4 *******************************************************************************

7 *******************************************************************************

8 * dictionarydata.h

9 *

10 * created on: 2012may31

11 * created by: Markus W. Scherer & Maxime Serrano

12 */

14 #ifndef __DICTIONARYDATA_H__

15 #define __DICTIONARYDATA_H__

17 #include "unicode/utypes.h"

19 #if !UCONFIG_NO_BREAK_ITERATION

21 #include "unicode/utext.h"

22 #include "unicode/udata.h"

23 #include "udataswp.h"

24 #include "unicode/uobject.h"

25 #include "unicode/ustringtrie.h"

27 U_NAMESPACE_BEGIN

29 class UCharsTrie;

30 class BytesTrie;

32 class U_COMMON_API DictionaryData : public UMemory {

33 public:

34 static const int32_t TRIE_TYPE_BYTES; // = 0;

35 static const int32_t TRIE_TYPE_UCHARS; // = 1;

36 static const int32_t TRIE_TYPE_MASK; // = 7;

37 static const int32_t TRIE_HAS_VALUES; // = 8;

39 static const int32_t TRANSFORM_NONE; // = 0;

40 static const int32_t TRANSFORM_TYPE_OFFSET; // = 0x1000000;

41 static const int32_t TRANSFORM_TYPE_MASK; // = 0x7f000000;

42 static const int32_t TRANSFORM_OFFSET_MASK; // = 0x1fffff;

44 enum {

45 // Byte offsets from the start of the data, after the generic header.

46 IX_STRING_TRIE_OFFSET,

47 IX_RESERVED1_OFFSET,

48 IX_RESERVED2_OFFSET,

49 IX_TOTAL_SIZE,

51 // Trie type: TRIE_HAS_VALUES | TRIE_TYPE_BYTES etc.

52 IX_TRIE_TYPE,

53 // Transform specification: TRANSFORM_TYPE_OFFSET | 0xe00 etc.

54 IX_TRANSFORM,

56 IX_RESERVED6,

57 IX_RESERVED7,

58 IX_COUNT

59 };

60 };

62 /**

63 * Wrapper class around generic dictionaries, implementing matches().

64 * getType() should return a TRIE_TYPE_??? constant from DictionaryData.

65 *

66 * All implementations of this interface must be thread-safe if they are to be used inside of the

67 * dictionary-based break iteration code.

68 */

69 class U_COMMON_API DictionaryMatcher : public UMemory {

70 public:

71 DictionaryMatcher() {}

72 virtual ~DictionaryMatcher();

73 // this should emulate CompactTrieDictionary::matches()

74 /* @param text The text in which to look for matching words. Matching begins

75 * at the current position of the UText.

76 * @param maxLength The max length of match to consider. Units are the native indexing

77 * units of the UText.

78 * @param limit Capacity of output arrays, which is also the maximum number of

79 * matching words to be found.

80 * @param lengths output array, filled with the lengths of the matches, in order,

81 * from shortest to longest. Lengths are in native indexing units

82 * of the UText. May be NULL.

83 * @param cpLengths output array, filled with the lengths of the matches, in order,

84 * from shortest to longest. Lengths are the number of Unicode code points.

85 * May be NULL.

86 * @param values Output array, filled with the values associated with the words found.

87 * May be NULL.

88 * @param prefix Output parameter, the code point length of the prefix match, even if that

89 * prefix didn't lead to a complete word. Will always be >= the cpLength

90 * of the longest complete word matched. May be NULL.

91 * @return Number of matching words found.

92 */

     virtual int32_t matches(UText *text, int32_t maxLength, int32_t limit,

                             int32_t *lengths, int32_t *cpLengths, int32_t *values,

                             int32_t *prefix) const = 0;

97 /** @return DictionaryData::TRIE_TYPE_XYZ */

     virtual int32_t getType() const = 0;

99 };

100

101 // Implementation of the DictionaryMatcher interface for a UCharsTrie dictionary

102 class U_COMMON_API UCharsDictionaryMatcher : public DictionaryMatcher {

103 public:

104 // constructs a new UCharsDictionaryMatcher.

105 // The UDataMemory * will be closed on this object's destruction.

     UCharsDictionaryMatcher(const UChar *c, UDataMemory *f) : characters(c), file(f) { }

107 virtual ~UCharsDictionaryMatcher();

     virtual int32_t matches(UText *text, int32_t maxLength, int32_t limit,

                             int32_t *lengths, int32_t *cpLengths, int32_t *values,

                             int32_t *prefix) const;

     virtual int32_t getType() const;

112 private:

113 const UChar *characters;

114 UDataMemory *file;

115 };

116

117 // Implementation of the DictionaryMatcher interface for a BytesTrie dictionary

118 class U_COMMON_API BytesDictionaryMatcher : public DictionaryMatcher {

119 public:

120 // constructs a new BytesTrieDictionaryMatcher

121 // the transform constant should be the constant read from the file, not a masked version!

122 // the UDataMemory * fed in here will be closed on this object's destruction

     BytesDictionaryMatcher(const char *c, int32_t t, UDataMemory *f)

             : characters(c), transformConstant(t), file(f) { }

125 virtual ~BytesDictionaryMatcher();

     virtual int32_t matches(UText *text, int32_t maxLength, int32_t limit,

                             int32_t *lengths, int32_t *cpLengths, int32_t *values,

                             int32_t *prefix) const;

     virtual int32_t getType() const;

130 private:

     UChar32 transform(UChar32 c) const;

132

133 const char *characters;

134 int32_t transformConstant;

135 UDataMemory *file;

136 };

137

138 U_NAMESPACE_END

139

140 U_CAPI int32_t U_EXPORT2

 udict_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData, UErrorCode *pErrorCode);

142

/**
 * Format of dictionary .dict data files.
 * Format version 1.0.
 *
 * A dictionary .dict data file contains a byte-serialized BytesTrie or
 * a UChars-serialized UCharsTrie.
 * Such files are used in dictionary-based break iteration (DBBI).
 *
 * For a BytesTrie, a transformation type is specified for
 * transforming Unicode strings into byte sequences.
 *
 * A .dict file begins with a standard ICU data file header
 * (DataHeader, see ucmndata.h and unicode/udata.h).
 * The UDataInfo.dataVersion field is currently unused (set to 0.0.0.0).
 *
 * After the header, the file contains the following parts.
 * Constants are defined in the DictionaryData class.
 *
 * For the data structure of BytesTrie & UCharsTrie see
 * http://site.icu-project.org/design/struct/tries
 * and the bytestrie.h and ucharstrie.h header files.
 *
 * int32_t indexes[indexesLength]; -- indexesLength=indexes[IX_STRING_TRIE_OFFSET]/4;
 *
 *      The first four indexes are byte offsets in ascending order.
 *      Each byte offset marks the start of the next part in the data file,
 *      and the end of the previous one.
 *      When two consecutive byte offsets are the same, then the corresponding part is empty.
 *      Byte offsets are offsets from after the header,
 *      that is, from the beginning of the indexes[].
 *      Each part starts at an offset with proper alignment for its data.
 *      If necessary, the previous part may include padding bytes to achieve this alignment.
 *
 *      trieType=indexes[IX_TRIE_TYPE] defines the trie type.
 *      transform=indexes[IX_TRANSFORM] defines the Unicode-to-bytes transformation.
 *          If the transformation type is TRANSFORM_TYPE_OFFSET,
 *          then the lower 21 bits contain the offset code point.
 *          Each code point c is mapped to byte b = (c - offset).
 *          Code points outside the range offset..(offset+0xff) cannot be mapped
 *          and do not occur in the dictionary.
 *
 * stringTrie; -- a serialized BytesTrie or UCharsTrie
 *
 *      The dictionary maps strings to specific values (TRIE_HAS_VALUES bit set in trieType),
 *      or it maps all strings to 0 (TRIE_HAS_VALUES bit not set).
 */

189

190 #endif /* !UCONFIG_NO_BREAK_ITERATION */

191 #endif /* __DICTIONARYDATA_H__ */