]> git.saurik.com Git - apple/icu.git/blame - icuSources/data/brkitr/line_th.txt
ICU-3.13.tar.gz
[apple/icu.git] / icuSources / data / brkitr / line_th.txt
CommitLineData
b75a7d8f
A
1# Copyright (c) 2002, International Business Machines Corporation and
2# others. All Rights Reserved.
3#
4# file: line_th.txt
5#
6# Line Breaking Rules for ICU rules based break iteration.
7# Implement default line breaking as defined by Unicode TR 14.
8#
9
10
11#
12# Character Classes defined by Unicode TR 14.
13# These are generated by a script from the Unicode LineBreak derived
14# properties file.
15#
16
17############ Start of Script-Generated Definitions #######################
18
19$LF = [ \u000A];
20
21$IN = [ \u2024-\u2026];
22
23$SY = [ \u002F];
24
25$EX = [ \u0021 \u003F \u2762-\u2763 \uFE56-\uFE57 \uFF01 \uFF1F];
26
27$BA = [ \u0009 \u007C \u00AD \u058A \u0F0B \u1361 \u1680 \u17D5 \u2000-\u2006
28 \u2008-\u200A \u2010 \u2012-\u2013 \u2027 \u205F];
29
30$IS = [ \u002C \u002E \u003A-\u003B \u0589];
31
32$BB = [ \u00B4 \u02C8 \u02CC \u1806];
33
34$SA = [ \u0E01-\u0E30 \u0E32-\u0E33 \u0E40-\u0E46 \u0E81-\u0E82 \u0E84 \u0E87-\u0E88
35 \u0E8A \u0E8D \u0E94-\u0E97 \u0E99-\u0E9F \u0EA1-\u0EA3 \u0EA5
36 \u0EA7 \u0EAA-\u0EAB \u0EAD-\u0EB0 \u0EB2-\u0EB3 \u0EBD \u0EC0-\u0EC4
37 \u0EC6 \u0EDC-\u0EDD \u1000-\u1021 \u1023-\u1027 \u1029-\u102A
38 \u1050-\u1055 \u1780-\u17B3];
39
40$CB = [ \uFFFC];
41
42$XX = [ \uE000-\uF8FF \U000F0000-\U000FFFFD \U00100000-\U0010FFFD];
43
44$HY = [ \u002D];
45
46$AI = [ \u00A1 \u00A7-\u00A8 \u00AA \u00B2-\u00B3 \u00B6-\u00BA \u00BC-\u00BF
47 \u00C6 \u00D0 \u00D7-\u00D8 \u00DE-\u00E1 \u00E6 \u00E8-\u00EA
48 \u00EC-\u00ED \u00F0 \u00F2-\u00F3 \u00F7-\u00FA \u00FC \u00FE
49 \u0101 \u0111 \u0113 \u011B \u0126-\u0127 \u012B \u0131-\u0133
50 \u0138 \u013F-\u0142 \u0144 \u0148-\u014A \u014D \u0152-\u0153
51 \u0166-\u0167 \u016B \u01CE \u01D0 \u01D2 \u01D4 \u01D6 \u01D8
52 \u01DA \u01DC \u0251 \u0261 \u02C7 \u02C9-\u02CB \u02CD \u02D0
53 \u02D8-\u02DB \u02DD \u0391-\u03A1 \u03A3-\u03A9 \u03B1-\u03C1
54 \u03C3-\u03C9 \u0401 \u0410-\u044F \u0451 \u2015-\u2016 \u2020-\u2021
55 \u203B \u2074 \u207F \u2081-\u2084 \u2105 \u2113 \u2121-\u2122
56 \u212B \u2140 \u2154-\u2155 \u215B \u215E \u2160-\u216B \u2170-\u2179
57 \u2190-\u2199 \u21D2 \u21D4 \u2200 \u2202-\u2203 \u2207-\u2208
58 \u220B \u220F \u2211 \u2215 \u221A \u221D-\u2220 \u2223 \u2225
59 \u2227-\u222C \u222E \u2234-\u2237 \u223C-\u223D \u2248 \u224C
60 \u2252 \u2260-\u2261 \u2264-\u2267 \u226A-\u226B \u226E-\u226F
61 \u2282-\u2283 \u2286-\u2287 \u2295 \u2299 \u22A5 \u22BF \u2312
62 \u2460-\u24BF \u24D0-\u24E9 \u24EB-\u24FE \u2500-\u254B \u2550-\u2574
63 \u2580-\u258F \u2592-\u2595 \u25A0-\u25A1 \u25A3-\u25A9 \u25B2-\u25B3
64 \u25B6-\u25B7 \u25BC-\u25BD \u25C0-\u25C1 \u25C6-\u25C8 \u25CB
65 \u25CE-\u25D1 \u25E2-\u25E5 \u25EF \u2605-\u2606 \u2609 \u260E-\u260F
66 \u2616-\u2617 \u261C \u261E \u2640 \u2642 \u2660-\u2661 \u2663-\u2665
67 \u2667-\u266A \u266C-\u266D \u266F \uFFFD];
68
69$ZW = [ \u200B];
70
71$SG = [ \uD800-\uDFFF];
72
73$AL = [ \u0023 \u0026 \u002A \u003C-\u003E \u0040-\u005A \u005E-\u007A \u007E
74 \u00A6 \u00A9 \u00AC \u00AE-\u00AF \u00B5 \u00C0-\u00C5 \u00C7-\u00CF
75 \u00D1-\u00D6 \u00D9-\u00DD \u00E2-\u00E5 \u00E7 \u00EB \u00EE-\u00EF
76 \u00F1 \u00F4-\u00F6 \u00FB \u00FD \u00FF-\u0100 \u0102-\u0110
77 \u0112 \u0114-\u011A \u011C-\u0125 \u0128-\u012A \u012C-\u0130
78 \u0134-\u0137 \u0139-\u013E \u0143 \u0145-\u0147 \u014B-\u014C
79 \u014E-\u0151 \u0154-\u0165 \u0168-\u016A \u016C-\u01CD \u01CF
80 \u01D1 \u01D3 \u01D5 \u01D7 \u01D9 \u01DB \u01DD-\u0220 \u0222-\u0233
81 \u0250 \u0252-\u0260 \u0262-\u02AD \u02B0-\u02C6 \u02CE-\u02CF
82 \u02D1-\u02D7 \u02DC \u02DE-\u02EE \u0374-\u0375 \u037A \u037E
83 \u0384-\u038A \u038C \u038E-\u0390 \u03AA-\u03B0 \u03C2 \u03CA-\u03CE
84 \u03D0-\u03F6 \u0400 \u0402-\u040F \u0450 \u0452-\u0482 \u048A-\u04CE
85 \u04D0-\u04F5 \u04F8-\u04F9 \u0500-\u050F \u0531-\u0556 \u0559-\u055F
86 \u0561-\u0587 \u05BE \u05C0 \u05C3 \u05D0-\u05EA \u05F0-\u05F4
87 \u060C \u061B \u061F \u0621-\u063A \u0640-\u064A \u066A-\u066F
88 \u0671-\u06D5 \u06E5-\u06E6 \u06E9 \u06FA-\u06FE \u0700-\u070D
89 \u0710 \u0712-\u072C \u0780-\u07A5 \u07B1 \u0905-\u0939 \u093D
90 \u0950 \u0958-\u0961 \u0964-\u0965 \u0970 \u0985-\u098C \u098F-\u0990
91 \u0993-\u09A8 \u09AA-\u09B0 \u09B2 \u09B6-\u09B9 \u09DC-\u09DD
92 \u09DF-\u09E1 \u09F0-\u09F1 \u09F4-\u09FA \u0A05-\u0A0A \u0A0F-\u0A10
93 \u0A13-\u0A28 \u0A2A-\u0A30 \u0A32-\u0A33 \u0A35-\u0A36 \u0A38-\u0A39
94 \u0A59-\u0A5C \u0A5E \u0A72-\u0A74 \u0A85-\u0A8B \u0A8D \u0A8F-\u0A91
95 \u0A93-\u0AA8 \u0AAA-\u0AB0 \u0AB2-\u0AB3 \u0AB5-\u0AB9 \u0ABD
96 \u0AD0 \u0AE0 \u0B05-\u0B0C \u0B0F-\u0B10 \u0B13-\u0B28 \u0B2A-\u0B30
97 \u0B32-\u0B33 \u0B36-\u0B39 \u0B3D \u0B5C-\u0B5D \u0B5F-\u0B61
98 \u0B70 \u0B83 \u0B85-\u0B8A \u0B8E-\u0B90 \u0B92-\u0B95 \u0B99-\u0B9A
99 \u0B9C \u0B9E-\u0B9F \u0BA3-\u0BA4 \u0BA8-\u0BAA \u0BAE-\u0BB5
100 \u0BB7-\u0BB9 \u0BF0-\u0BF2 \u0C05-\u0C0C \u0C0E-\u0C10 \u0C12-\u0C28
101 \u0C2A-\u0C33 \u0C35-\u0C39 \u0C60-\u0C61 \u0C85-\u0C8C \u0C8E-\u0C90
102 \u0C92-\u0CA8 \u0CAA-\u0CB3 \u0CB5-\u0CB9 \u0CDE \u0CE0-\u0CE1
103 \u0D05-\u0D0C \u0D0E-\u0D10 \u0D12-\u0D28 \u0D2A-\u0D39 \u0D60-\u0D61
104 \u0D85-\u0D96 \u0D9A-\u0DB1 \u0DB3-\u0DBB \u0DBD \u0DC0-\u0DC6
105 \u0DF4 \u0E4F \u0F00-\u0F0A \u0F0D-\u0F17 \u0F1A-\u0F1F \u0F2A-\u0F34
106 \u0F36 \u0F38 \u0F40-\u0F47 \u0F49-\u0F6A \u0F85 \u0F88-\u0F8B
107 \u0FBE-\u0FC5 \u0FC7-\u0FCC \u0FCF \u104A-\u104F \u10A0-\u10C5
108 \u10D0-\u10F8 \u10FB \u1200-\u1206 \u1208-\u1246 \u1248 \u124A-\u124D
109 \u1250-\u1256 \u1258 \u125A-\u125D \u1260-\u1286 \u1288 \u128A-\u128D
110 \u1290-\u12AE \u12B0 \u12B2-\u12B5 \u12B8-\u12BE \u12C0 \u12C2-\u12C5
111 \u12C8-\u12CE \u12D0-\u12D6 \u12D8-\u12EE \u12F0-\u130E \u1310
112 \u1312-\u1315 \u1318-\u131E \u1320-\u1346 \u1348-\u135A \u1362-\u1368
113 \u1372-\u137C \u13A0-\u13F4 \u1401-\u1676 \u1681-\u169A \u16A0-\u16F0
114 \u1700-\u170C \u170E-\u1711 \u1720-\u1731 \u1735-\u1736 \u1740-\u1751
115 \u1760-\u176C \u176E-\u1770 \u17DC \u1800-\u1805 \u1807-\u180A
116 \u1820-\u1877 \u1880-\u18A8 \u1E00-\u1E9B \u1EA0-\u1EF9 \u1F00-\u1F15
117 \u1F18-\u1F1D \u1F20-\u1F45 \u1F48-\u1F4D \u1F50-\u1F57 \u1F59
118 \u1F5B \u1F5D \u1F5F-\u1F7D \u1F80-\u1FB4 \u1FB6-\u1FC4 \u1FC6-\u1FD3
119 \u1FD6-\u1FDB \u1FDD-\u1FEF \u1FF2-\u1FF4 \u1FF6-\u1FFE \u2017
120 \u2022-\u2023 \u2038 \u203D-\u2043 \u2047-\u2052 \u2057 \u2061-\u2063
121 \u2070-\u2071 \u2075-\u207C \u2080 \u2085-\u208C \u2100-\u2102
122 \u2104 \u2106-\u2108 \u210A-\u2112 \u2114-\u2115 \u2117-\u2120
123 \u2123-\u2125 \u2127-\u212A \u212C-\u213A \u213D-\u213F \u2141-\u214B
124 \u2153 \u2156-\u215A \u215C-\u215D \u215F \u216C-\u216F \u217A-\u2183
125 \u219A-\u21D1 \u21D3 \u21D5-\u21FF \u2201 \u2204-\u2206 \u2209-\u220A
126 \u220C-\u220E \u2210 \u2214 \u2216-\u2219 \u221B-\u221C \u2221-\u2222
127 \u2224 \u2226 \u222D \u222F-\u2233 \u2238-\u223B \u223E-\u2247
128 \u2249-\u224B \u224D-\u2251 \u2253-\u225F \u2262-\u2263 \u2268-\u2269
129 \u226C-\u226D \u2270-\u2281 \u2284-\u2285 \u2288-\u2294 \u2296-\u2298
130 \u229A-\u22A4 \u22A6-\u22BE \u22C0-\u2311 \u2313-\u2328 \u232B-\u23B3
131 \u23B7-\u23CE \u2400-\u2426 \u2440-\u244A \u24C0-\u24CF \u24EA
132 \u254C-\u254F \u2575-\u257F \u2590-\u2591 \u2596-\u259F \u25A2
133 \u25AA-\u25B1 \u25B4-\u25B5 \u25B8-\u25BB \u25BE-\u25BF \u25C2-\u25C5
134 \u25C9-\u25CA \u25CC-\u25CD \u25D2-\u25E1 \u25E6-\u25EE \u25F0-\u2604
135 \u2607-\u2608 \u260A-\u260D \u2610-\u2613 \u2619-\u261B \u261D
136 \u261F-\u263F \u2641 \u2643-\u265F \u2662 \u2666 \u266B \u266E
137 \u2670-\u267D \u2680-\u2689 \u2701-\u2704 \u2706-\u2709 \u270C-\u2727
138 \u2729-\u274B \u274D \u274F-\u2752 \u2756 \u2758-\u275A \u2761
139 \u2764-\u2767 \u2776-\u2794 \u2798-\u27AF \u27B1-\u27BE \u27D0-\u27E5
140 \u27F0-\u2982 \u2999-\u29D7 \u29DC-\u29FB \u29FE-\u2AFF \uFB00-\uFB06
141 \uFB13-\uFB17 \uFB1D \uFB1F-\uFB36 \uFB38-\uFB3C \uFB3E \uFB40-\uFB41
142 \uFB43-\uFB44 \uFB46-\uFBB1 \uFBD3-\uFD3D \uFD50-\uFD8F \uFD92-\uFDC7
143 \uFDF0-\uFDFB \uFE70-\uFE74 \uFE76-\uFEFC \uFF66 \uFF71-\uFF9D
144 \uFFA0-\uFFBE \uFFC2-\uFFC7 \uFFCA-\uFFCF \uFFD2-\uFFD7 \uFFDA-\uFFDC
145 \uFFE8-\uFFEE \U00010300-\U0001031E \U00010320-\U00010323 \U00010330-\U0001034A
146 \U00010400-\U00010425 \U00010428-\U0001044D \U0001D000-\U0001D0F5
147 \U0001D100-\U0001D126 \U0001D12A-\U0001D164 \U0001D16A-\U0001D16C
148 \U0001D183-\U0001D184 \U0001D18C-\U0001D1A9 \U0001D1AE-\U0001D1DD
149 \U0001D400-\U0001D454 \U0001D456-\U0001D49C \U0001D49E-\U0001D49F
150 \U0001D4A2 \U0001D4A5-\U0001D4A6 \U0001D4A9-\U0001D4AC \U0001D4AE-\U0001D4B9
151 \U0001D4BB \U0001D4BD-\U0001D4C0 \U0001D4C2-\U0001D4C3 \U0001D4C5-\U0001D505
152 \U0001D507-\U0001D50A \U0001D50D-\U0001D514 \U0001D516-\U0001D51C
153 \U0001D51E-\U0001D539 \U0001D53B-\U0001D53E \U0001D540-\U0001D544
154 \U0001D546 \U0001D54A-\U0001D550 \U0001D552-\U0001D6A3 \U0001D6A8-\U0001D7C9];
155
156$OP = [ \u0028 \u005B \u007B \u0F3A \u0F3C \u169B \u201A \u201E \u2045 \u207D
157 \u208D \u2329 \u23B4 \u2768 \u276A \u276C \u276E \u2770 \u2772
158 \u2774 \u27E6 \u27E8 \u27EA \u2983 \u2985 \u2987 \u2989 \u298B
159 \u298D \u298F \u2991 \u2993 \u2995 \u2997 \u29D8 \u29DA \u29FC
160 \u3008 \u300A \u300C \u300E \u3010 \u3014 \u3016 \u3018 \u301A
161 \u301D \uFD3E \uFE35 \uFE37 \uFE39 \uFE3B \uFE3D \uFE3F \uFE41
162 \uFE43 \uFE59 \uFE5B \uFE5D \uFF08 \uFF3B \uFF5B \uFF5F \uFF62];
163
164$BK = [ \u000C \u2028-\u2029];
165
166$PO = [ \u0025 \u00A2 \u00B0 \u2030-\u2037 \u20A7 \u2103 \u2109 \u2126 \uFDFC
167 \uFE6A \uFF05 \uFFE0];
168
169$NS = [ \u0E5A-\u0E5B \u17D4 \u17D6-\u17DA \u203C \u2044 \u3005 \u301C \u303B-\u303C
170 \u3041 \u3043 \u3045 \u3047 \u3049 \u3063 \u3083 \u3085 \u3087
171 \u308E \u3095-\u3096 \u309B-\u309E \u30A0-\u30A1 \u30A3 \u30A5
172 \u30A7 \u30A9 \u30C3 \u30E3 \u30E5 \u30E7 \u30EE \u30F5-\u30F6
173 \u30FB \u30FD \u31F0-\u31FF \uFE54-\uFE55 \uFF1A-\uFF1B \uFF65
174 \uFF67-\uFF70 \uFF9E-\uFF9F];
175
176$CL = [ \u0029 \u005D \u007D \u0F3B \u0F3D \u169C \u2046 \u207E \u208E \u232A
177 \u23B5 \u2769 \u276B \u276D \u276F \u2771 \u2773 \u2775 \u27E7
178 \u27E9 \u27EB \u2984 \u2986 \u2988 \u298A \u298C \u298E \u2990
179 \u2992 \u2994 \u2996 \u2998 \u29D9 \u29DB \u29FD \u3001-\u3002
180 \u3009 \u300B \u300D \u300F \u3011 \u3015 \u3017 \u3019 \u301B
181 \u301E-\u301F \uFD3F \uFE36 \uFE38 \uFE3A \uFE3C \uFE3E \uFE40
182 \uFE42 \uFE44 \uFE50 \uFE52 \uFE5A \uFE5C \uFE5E \uFF09 \uFF0C
183 \uFF0E \uFF3D \uFF5D \uFF60-\uFF61 \uFF63-\uFF64];
184
185$NU = [ \u0030-\u0039 \u0660-\u0669 \u06F0-\u06F9 \u0966-\u096F \u09E6-\u09EF
186 \u0A66-\u0A6F \u0AE6-\u0AEF \u0B66-\u0B6F \u0BE7-\u0BEF \u0C66-\u0C6F
187 \u0CE6-\u0CEF \u0D66-\u0D6F \u0E50-\u0E59 \u0ED0-\u0ED9 \u0F20-\u0F29
188 \u1040-\u1049 \u1369-\u1371 \u17E0-\u17E9 \u1810-\u1819 \U0001D7CE-\U0001D7FF];
189
190$CM = [ \u0000-\u0008 \u000B \u000E-\u001F \u007F-\u009F \u0300-\u034F \u0360-\u036F
191 \u0483-\u0486 \u0488-\u0489 \u0591-\u05A1 \u05A3-\u05B9 \u05BB-\u05BD
192 \u05BF \u05C1-\u05C2 \u05C4 \u064B-\u0655 \u0670 \u06D6-\u06E4
193 \u06E7-\u06E8 \u06EA-\u06ED \u070F \u0711 \u0730-\u074A \u07A6-\u07B0
194 \u0901-\u0903 \u093C \u093E-\u094D \u0951-\u0954 \u0962-\u0963
195 \u0981-\u0983 \u09BC \u09BE-\u09C4 \u09C7-\u09C8 \u09CB-\u09CD
196 \u09D7 \u09E2-\u09E3 \u0A02 \u0A3C \u0A3E-\u0A42 \u0A47-\u0A48
197 \u0A4B-\u0A4D \u0A70-\u0A71 \u0A81-\u0A83 \u0ABC \u0ABE-\u0AC5
198 \u0AC7-\u0AC9 \u0ACB-\u0ACD \u0B01-\u0B03 \u0B3C \u0B3E-\u0B43
199 \u0B47-\u0B48 \u0B4B-\u0B4D \u0B56-\u0B57 \u0B82 \u0BBE-\u0BC2
200 \u0BC6-\u0BC8 \u0BCA-\u0BCD \u0BD7 \u0C01-\u0C03 \u0C3E-\u0C44
201 \u0C46-\u0C48 \u0C4A-\u0C4D \u0C55-\u0C56 \u0C82-\u0C83 \u0CBE-\u0CC4
202 \u0CC6-\u0CC8 \u0CCA-\u0CCD \u0CD5-\u0CD6 \u0D02-\u0D03 \u0D3E-\u0D43
203 \u0D46-\u0D48 \u0D4A-\u0D4D \u0D57 \u0D82-\u0D83 \u0DCA \u0DCF-\u0DD4
204 \u0DD6 \u0DD8-\u0DDF \u0DF2-\u0DF3 \u0E31 \u0E34-\u0E3A \u0E47-\u0E4E
205 \u0EB1 \u0EB4-\u0EB9 \u0EBB-\u0EBC \u0EC8-\u0ECD \u0F18-\u0F19
206 \u0F35 \u0F37 \u0F39 \u0F3E-\u0F3F \u0F71-\u0F84 \u0F86-\u0F87
207 \u0F90-\u0F97 \u0F99-\u0FBC \u0FC6 \u102C-\u1032 \u1036-\u1039
208 \u1056-\u1059 \u1160-\u11A2 \u11A8-\u11F9 \u1712-\u1714 \u1732-\u1734
209 \u1752-\u1753 \u1772-\u1773 \u17B4-\u17D3 \u180B-\u180E \u18A9
210 \u200C-\u200F \u202A-\u202E \u206A-\u206F \u20D0-\u20EA \u302A-\u302F
211 \u3099-\u309A \uFB1E \uFE00-\uFE0F \uFE20-\uFE23 \uFFF9-\uFFFB
212 \U0001D165-\U0001D169 \U0001D16D-\U0001D182 \U0001D185-\U0001D18B
213 \U0001D1AA-\U0001D1AD \U000E0001 \U000E0020-\U000E007F];
214
215$PR = [ \u0024 \u002B \u005C \u00A3-\u00A5 \u00B1 \u09F2-\u09F3 \u0E3F \u17DB
216 \u20A0-\u20A6 \u20A8-\u20B1 \u2116 \u2212-\u2213 \uFE69 \uFF04
217 \uFFE1 \uFFE5-\uFFE6];
218
219$B2 = [ \u2014];
220
221$ID = [ \u1100-\u1159 \u115F \u2E80-\u2E99 \u2E9B-\u2EF3 \u2F00-\u2FD5 \u2FF0-\u2FFB
222 \u3000 \u3003-\u3004 \u3006-\u3007 \u3012-\u3013 \u3020-\u3029
223 \u3030-\u303A \u303D-\u303F \u3042 \u3044 \u3046 \u3048 \u304A-\u3062
224 \u3064-\u3082 \u3084 \u3086 \u3088-\u308D \u308F-\u3094 \u309F
225 \u30A2 \u30A4 \u30A6 \u30A8 \u30AA-\u30C2 \u30C4-\u30E2 \u30E4
226 \u30E6 \u30E8-\u30ED \u30EF-\u30F4 \u30F7-\u30FA \u30FC \u30FE-\u30FF
227 \u3105-\u312C \u3131-\u318E \u3190-\u31B7 \u3200-\u321C \u3220-\u3243
228 \u3251-\u327B \u327F-\u32CB \u32D0-\u32FE \u3300-\u3376 \u337B-\u33DD
229 \u33E0-\u33FE \u3400-\u4DB5 \u4E00-\u9FA5 \uA000-\uA48C \uA490-\uA4C6
230 \uAC00-\uD7A3 \uF900-\uFA2D \uFA30-\uFA6A \uFE30-\uFE34 \uFE45-\uFE46
231 \uFE49-\uFE4F \uFE51 \uFE58 \uFE5F-\uFE66 \uFE68 \uFE6B \uFF02-\uFF03
232 \uFF06-\uFF07 \uFF0A-\uFF0B \uFF0D \uFF0F-\uFF19 \uFF1C-\uFF1E
233 \uFF20-\uFF3A \uFF3C \uFF3E-\uFF5A \uFF5C \uFF5E \uFFE2-\uFFE4
234 \U00020000-\U0002A6D6 \U0002F800-\U0002FA1D];
235
236$SP = [ \u0020];
237
238$QU = [ \u0022 \u0027 \u00AB \u00BB \u2018-\u2019 \u201B-\u201D \u201F \u2039-\u203A
239 \u23B6 \u275B-\u275E];
240
241$CR = [ \u000D];
242
243$GL = [ \u00A0 \u0F0C \u2007 \u2011 \u202F \u2060 \uFEFF];
244
245############ End of Script-Generated Definitions #######################
246
247
248
249#
250# Thai Dictionary related definitions and rules
251# $dictionary skips U+0E2F, U+0E45 and U+0E46; U+0E2F and U+0E46 get their own sets below.
252
253$dictionary = [\u0e01-\u0e2e \u0e30-\u0e3a \u0e40-\u0e44 \u0e47-\u0e4e]; # this rule breaks the iterator with mixed Thai and English
254$paiyannoi = [\u0e2f];
255$maiyamok = [\u0e46];
256$thai_etc = $paiyannoi \u0e25 $paiyannoi; # the three-character sequence U+0E2F U+0E25 U+0E2F
257
258
259
260
261#
262# Character classes from TR 29. Needed for finding characters.
263#
264# $Extend is all combining characters, and none of the other cruft that
265# TR14 puts into $CM, which is its concept of combining marks.
266#
267$Extend = # From UNIDATA/DerivedCoreProperties.txt
268 [\u0300-\u034E \u0360-\u036F \u0483-\u0486 \u0488-\u0489 \u0591-\u05A1 \u05A3-\u05B9
269 \u05BB-\u05BD \u05BF \u05C1-\u05C2 \u05C4 \u064B-\u0655 \u0670 \u06D6-\u06DC
270 \u06DE \u06DF-\u06E4 \u06E7-\u06E8 \u06EA-\u06ED \u0711 \u0730-\u074A
271 \u07A6-\u07B0 \u0901-\u0902 \u0903 \u093C \u093E-\u0940 \u0941-\u0948
272 \u0949-\u094C \u0951-\u0954 \u0962-\u0963 \u0981 \u0982-\u0983 \u09BC
273 \u09BE-\u09C0 \u09C1-\u09C4 \u09C7-\u09C8 \u09CB-\u09CC \u09D7 \u09E2-\u09E3
274 \u0A02 \u0A3C \u0A3E-\u0A40 \u0A41-\u0A42 \u0A47-\u0A48 \u0A4B-\u0A4C
275 \u0A70-\u0A71 \u0A81-\u0A82 \u0A83 \u0ABC \u0ABE-\u0AC0 \u0AC1-\u0AC5
276 \u0AC7-\u0AC8 \u0AC9 \u0ACB-\u0ACC \u0B01 \u0B02-\u0B03 \u0B3C \u0B3E
277 \u0B3F \u0B40 \u0B41-\u0B43 \u0B47-\u0B48 \u0B4B-\u0B4C \u0B56 \u0B57
278 \u0B82 \u0BBE-\u0BBF \u0BC0 \u0BC1-\u0BC2 \u0BC6-\u0BC8 \u0BCA-\u0BCC \u0BD7
279 \u0C01-\u0C03 \u0C3E-\u0C40 \u0C41-\u0C44 \u0C46-\u0C48 \u0C4A-\u0C4C
280 \u0C55-\u0C56 \u0C82-\u0C83 \u0CBE \u0CBF \u0CC0-\u0CC4 \u0CC6
281 \u0CC7-\u0CC8 \u0CCA-\u0CCB \u0CCC \u0CD5-\u0CD6 \u0D02-\u0D03 \u0D3E-\u0D40
282 \u0D41-\u0D43 \u0D46-\u0D48 \u0D4A-\u0D4C \u0D57 \u0D82-\u0D83 \u0DCF-\u0DD1
283 \u0DD2-\u0DD4 \u0DD6 \u0DD8-\u0DDF \u0DF2-\u0DF3 \u0E31 \u0E34-\u0E39
284 \u0E47-\u0E4E \u0EB1 \u0EB4-\u0EB9 \u0EBB-\u0EBC \u0EC8-\u0ECD \u0F18-\u0F19
285 \u0F35 \u0F37 \u0F39 \u0F3E-\u0F3F \u0F71-\u0F7E \u0F7F \u0F80-\u0F84
286 \u0F86-\u0F87 \u0F90-\u0F97 \u0F99-\u0FBC \u0FC6 \u102C \u102D-\u1030 \u1031
287 \u1032 \u1036-\u1037 \u1038 \u1056-\u1057 \u1058-\u1059 \u1712-\u1714
288 \u1732-\u1734 \u1752-\u1753 \u1772-\u1773 \u17B4-\u17B6 \u17B7-\u17BD
289 \u17BE-\u17C5 \u17C6 \u17C7-\u17C8 \u17C9-\u17D1 \u17D3 \u180B-\u180D
290 \u18A9 \u20D0-\u20DC \u20DD-\u20E0 \u20E1 \u20E2-\u20E4 \u20E5-\u20EA
291 \u302A-\u302F \u3099-\u309A \uFB1E \uFE00-\uFE0F \uFE20-\uFE23 \uFF9E-\uFF9F
292 \U0001D165-\U0001D166 \U0001D167-\U0001D169 \U0001D16D-\U0001D172
293 \U0001D17B-\U0001D182 \U0001D185-\U0001D18B \U0001D1AA-\U0001D1AD];
294
295
296#
297# Rule LB1. By default, treat AI (characters with ambiguous east Asian width) and
298# SA (South East Asian: Thai, Lao, Khmer) as $AL (Alphabetic)
299# Characters in $dictionary are subtracted from $SA here; they are matched by $ThaiRange instead.
300$ALPlus = $AL | $AI | [$SA - $dictionary];
301
302#
303# Combining Marks. X $CM* behaves as if it were X. Rule LB6.
304# TODO: This is going to produce some odd results, because of the non-combining
305# chars that are included in $CM. Use $Extend instead, where possible.
306# Only $ALcm and $IDcm below still use $CM; every other definition uses $Extend.
307$ALcm = $ALPlus $CM*;
308$IDcm = $ID $CM*;
309$NUcm = $NU $Extend*;
310$HYcm = $HY $Extend*;
311$SPcm = $SP $Extend*;
312$QUcm = $QU $Extend*;
313$POcm = $PO $Extend*;
314$OPcm = $OP $Extend*;
315$BAcm = $BA $Extend*;
316$BBcm = $BB $Extend*;
317$NScm = $NS $Extend*;
318$GLcm = $GL $Extend*;
319$B2cm = $B2 $Extend*;
320$INcm = $IN $Extend*;
321
322
323# New Lines. Always break after, never break before.
324# Rule LB 3
325#
326# Endings. NewLine or Zero Width Space, or both. Rules 4, 5
327# Because we never break before these things, $Endings
328# appears at the end of line break rule.
329# $Endings can match the empty string; $EndingsMandatory always contains a $NLF or a $ZW.
330$NLF = $BK | $CR | $LF | $CR $LF;
331$Endings = $SPcm* $ZW* $NLF?;
332$EndingsMandatory = $SPcm* $NLF | $SPcm* $ZW $NLF?;
333
334
335#
336# Openings Sequences that can precede Words, and that should not be separated from them.
337# Rules LB 9, 10
338# Zero or more repetitions of: an optional quote (plus spaces), then an opening punctuation (plus spaces).
339$Openings = (($QUcm $SPcm*)? $OPcm $SPcm*)*;
340
341# The alternation below also accepts $maiyamok (U+0E46).
342# Closings Sequences that follow words, and that should not be separated from them,
343# Rule LB 8, 11, 15
344$Closings = ($SPcm*( ($CL ($SPcm* $NScm)? | $EX | $IS | $SY) $Extend*) | $BAcm | $HYcm | $NScm | $maiyamok)*;
345
346#
347# Words. Includes mixed Alpha-numerics.
348# Rules 11a, 16, 17, 19, more or less.
349# $WordLikeThing is the union of the four shapes defined here: $Number, $Word, $Dashes, $ThaiRange.
350$NumberInterior = $IDcm | ($NUcm | $ALcm | $IS $NUcm)+;
351$Number = $PR? ($OPcm | $HYcm)? $NumberInterior $CL? $POcm?; # Fancy Number 18
352$Word = (($IDcm | ($ALcm | $NUcm)+) ($POcm? | $INcm?)); # Alpha-numeric. 16, 17
353$Dashes = (($B2cm $SPcm*)*); # Dashes 11a
354$ThaiRange = $dictionary+ | $thai_etc; # a run of $dictionary characters, or the fixed $thai_etc sequence
355$WordLikeThing = $Number | $Word | $Dashes | $ThaiRange;
356
357
358
359
360$Word15 = ($BBcm* ($WordLikeThing)? ($BAcm | $HYcm | $NScm)*) | # Rule 15. Stuff sticks around words.
361 [^[:Cc:] $BK $CR $LF $ZW $SP $GL] $Extend* | # Allow characters that don't meet the
362 [^$BK $CR $LF $ZW $SP $GL ]; # more elaborate definitions for WORD
363 # to be glued.
364# $GluedWord: one or more $Word15 items joined by, and optionally preceded by, a $GLcm or $QUcm.
365$GluedWord = ($GLcm | $QUcm)? $Word15 (($GLcm | $QUcm) $Word15)*; # "Glue" will stick anything below it together.
366 # Rules 13, 14
367
368#
369# The actual rules, a combination of everything defined above.
370# The first two differ only in their tail: optional $paiyannoi plus $EndingsMandatory, versus $Endings.
371$Openings $GluedWord $Closings $paiyannoi? $EndingsMandatory;
372$Openings $GluedWord $Closings $Endings;
373# NOTE(review): '/' below is presumably the look-ahead marker (context after it is required but not consumed) - confirm.
374$Openings $GluedWord $Closings $paiyannoi /
375 ([^\u0e25 $Extend] | \u0e25[^$paiyannoi $Extend]);
376
377# NOTE(review): the commented-out text below looks like an earlier form of the $paiyannoi rule above - confirm.
378 #"$word($nbsp+$word)*$paiyannoi/([^[\u0e25$_ignore_]]|"
379 # + "\u0e25[^$paiyannoi$_ignore_]);"
380
381
382#
383# Reverse Rules.
384#
385# Back up to a hard break or a space that will cause a boundary.
386# Not all spaces cause line breaks. $SpaceGlue represents a sequence
387# containing a space that may inhibit a break from occurring.
388# $ClumpingChars is every character except $SP, $BK, $CR and $LF.
389$SpaceGlue = ([$ZW $CL $IS $NS $OP] ($Extend* $SP)) | (($Extend* $SP)+ $OP);
390$ClumpingChars = [^$SP $BK $CR $LF];
391# NOTE(review): the leading '!' presumably marks the reverse rule, and "$LF $CR" a CR LF pair seen backwards - confirm.
392!. . $ClumpingChars* ($SpaceGlue $ClumpingChars*)* (. | $LF $CR);
393