1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 *******************************************************************************
6 * Copyright (C) 1999-2014 International Business Machines
7 * Corporation and others. All Rights Reserved.
9 *******************************************************************************
10 * file name: rbbidata.h
12 * tab size: 8 (not used)
15 * RBBI data formats Includes
17 * Structs that describe the format of the Binary RBBI data,
18 * as it is stored in ICU's data file.
20 * RBBIDataWrapper - Instances of this class sit between the
21 * raw data structs and the RulesBasedBreakIterator objects
22 * that are created by applications. The wrapper class
23 * provides reference counting for the underlying data,
24 * and direct pointers to data that would not otherwise
25 * be accessible without ugly pointer arithmetic. The
26 * wrapper does not attempt to provide any higher level
27 * abstractions for the data itself.
29 * There will be only one instance of RBBIDataWrapper for any
30 * set of RBBI run time data being shared by instances
31 * (clones) of RulesBasedBreakIterator.
34 #ifndef __RBBIDATA_H__
35 #define __RBBIDATA_H__
37 #include "unicode/utypes.h"
38 #include "unicode/udata.h"
42 * Swap RBBI data. See udataswp.h.
/* Declaration of ubrk_swap, the data-swapping entry point for RBBI data. */
/* Parameters: a UDataSwapper, an input buffer with its length, an output */
/* buffer, and a UErrorCode pointer; the function returns an int32_t. */
/* NOTE(review): the exact meaning of length, outData and the return value */
/* is presumably the common swap-function contract described in */
/* udataswp.h -- confirm there before relying on it. */
45 U_CAPI
int32_t U_EXPORT2
46 ubrk_swap(const UDataSwapper
*ds
,
47 const void *inData
, int32_t length
, void *outData
,
48 UErrorCode
*pErrorCode
);
52 #include "unicode/uobject.h"
53 #include "unicode/unistr.h"
54 #include "unicode/uversion.h"
60 // The current RBBI data format version, stored as four uint8_t values.
// NOTE(review): presumably the value compared against
// RBBIDataHeader::fFormatVersion when data is loaded -- confirm in the loader.
61 static const uint8_t RBBI_DATA_FORMAT_VERSION
[] = {5, 0, 0, 0};
64 * The following structs map exactly onto the raw data from the ICU common data file.
/* Header of a block of binary RBBI data. It carries a magic number, the */
/* data format version, the total data length, the character category */
/* count, and the byte offsets (with sizes for some sections) of the */
/* sections that make up the data. The layout mirrors the stored data, */
/* so members must not be reordered or resized. */
66 struct RBBIDataHeader
{
67 uint32_t fMagic
; /* == 0xb1a0 (hexadecimal; the third digit is one, not the letter 'l') */
68 UVersionInfo fFormatVersion
; /* Data Format. Same as the value in struct UDataInfo */
69 /* if there is one associated with this data. */
70 /* (version originates in rbbi, is copied to UDataInfo) */
71 uint32_t fLength
; /* Total length in bytes of this RBBI Data, */
72 /* including all sections, not just the header. */
73 uint32_t fCatCount
; /* Number of character categories. */
76 /* Offsets and sizes of each of the subsections within the RBBI data. */
77 /* All offsets are bytes from the start of the RBBIDataHeader. */
78 /* All sizes are in bytes. */
80 uint32_t fFTable
; /* Offset to the forward state transition table. */
82 uint32_t fRTable
; /* Offset to the reverse state transition table. */
84 uint32_t fTrie
; /* Offset to Trie data for character categories */
86 uint32_t fRuleSource
; /* Offset to the source for the break */
87 uint32_t fRuleSourceLen
; /* rules. Stored UChar *. */
88 uint32_t fStatusTable
; /* Offset to the table of rule status values */
89 uint32_t fStatusTableLen
; /* Size of the rule status table (in bytes, per the note above). */
91 uint32_t fReserved
[6]; /* Reserved for expansion */
/* One row of a state table. Each row holds three int16_t fields */
/* (accepting, look-ahead and tag index) followed by the next-state */
/* entries, which are indexed by character category. The row is */
/* variable length: fNextState is declared with one element but really */
/* holds one entry per category. */
97 struct RBBIStateTableRow
{
98 int16_t fAccepting
; /* Non-zero if this row is for an accepting state. */
99 /* Value 0: not an accepting state. */
100 /* -1: Unconditional Accepting state. */
101 /* positive: Look-ahead match has completed. */
102 /* Actual boundary position happened earlier */
103 /* Value here == fLookAhead in earlier */
104 /* state, at actual boundary pos. */
105 int16_t fLookAhead
; /* Non-zero if this row is for a state that */
106 /* corresponds to a '/' in the rule source. */
107 /* Value is the same as the fAccepting */
108 /* value for the rule (which will appear */
109 /* in a different state). */
110 int16_t fTagIdx
; /* Non-zero if this row covers a {tagged} position */
111 /* from a rule. Value is the index in the */
112 /* StatusTable of the set of matching */
113 /* tags (rule status values) */
115 uint16_t fNextState
[1]; /* Next State, indexed by char category. */
116 /* Variable-length array declared with length 1 */
117 /* to disable bounds checkers. */
118 /* Array Size is actually fData->fHeader->fCatCount */
119 /* CAUTION: see RBBITableBuilder::getTableSize() */
120 /* before changing anything here. */
/* A state transition table: four uint32_t header fields (state count, */
/* row length in bytes, option flags, reserved) followed by the row */
/* data. The first RBBIStateTableRow starts at fTableData; rows are */
/* fRowLen bytes long, so a row is located with byte arithmetic on */
/* fTableData rather than ordinary array indexing. */
124 struct RBBIStateTable
{
125 uint32_t fNumStates
; /* Number of states. */
126 uint32_t fRowLen
; /* Length of a state table row, in bytes. */
127 uint32_t fFlags
; /* Option Flags for this state table */
128 uint32_t fReserved
; /* reserved */
129 char fTableData
[1]; /* First RBBIStateTableRow begins here. */
130 /* Variable-length array declared with length 1 */
131 /* to disable bounds checkers. */
132 /* (making it char[] simplifies ugly address */
133 /* arithmetic for indexing variable length rows.) */
/* Enumerators of RBBIStateTableFlags. */
/* NOTE(review): these are presumably bit values stored in */
/* RBBIStateTable::fFlags ("Option Flags for this state table") -- */
/* confirm against the code that reads fFlags. */
137 RBBI_LOOKAHEAD_HARD_BREAK
= 1,
138 RBBI_BOF_REQUIRED
= 2
139 } RBBIStateTableFlags
;
143 /* The reference counting wrapper class */
145 class RBBIDataWrapper
: public UMemory
{
150 RBBIDataWrapper(const RBBIDataHeader
*data
, UErrorCode
&status
);
151 RBBIDataWrapper(const RBBIDataHeader
*data
, enum EDontAdopt dontAdopt
, UErrorCode
&status
);
152 RBBIDataWrapper(UDataMemory
* udm
, UErrorCode
&status
);
155 static UBool
isDataVersionAcceptable(const UVersionInfo version
);
158 void init(const RBBIDataHeader
*data
, UErrorCode
&status
);
159 RBBIDataWrapper
*addReference();
160 void removeReference();
161 UBool
operator ==(const RBBIDataWrapper
&other
) const;
163 const UnicodeString
&getRuleSourceString() const;
165 void printTable(const char *heading
, const RBBIStateTable
*table
);
168 /* Pointers to items within the data */
170 const RBBIDataHeader
*fHeader
;
171 const RBBIStateTable
*fForwardTable
;
172 const RBBIStateTable
*fReverseTable
;
173 const UChar
*fRuleSource
;
174 const int32_t *fRuleStatusTable
;
176 /* number of int32_t values in the rule status table. Used to sanity check indexing */
177 int32_t fStatusMaxIdx
;
182 u_atomic_int32_t fRefCount
;
183 UDataMemory
*fUDataMem
;
184 UnicodeString fRuleString
;
187 RBBIDataWrapper(const RBBIDataWrapper
&other
); /* forbid copying of this class */
188 RBBIDataWrapper
&operator=(const RBBIDataWrapper
&other
); /* forbid copying of this class */