]> git.saurik.com Git - apple/icu.git/blob - icuSources/data/brkitr/word_th.txt
ICU-6.2.13.tar.gz
[apple/icu.git] / icuSources / data / brkitr / word_th.txt
1 # Copyright (c) 2002-2003, International Business Machines Corporation and
2 # others. All Rights Reserved.
3 #
4 # word_th.txt Word Breaking Rules for ICU Rules Based Break Iterator (variant with Thai dictionary rules).
5 #
6
7
8 $Hiragana = [[:L:] & [:Hira:]]; # Letters ([:L:]) that are also in [:Hira:]; excluded from $Letter via $LineBreak
9 $Katakana = [[:L:] & [:Kana:]]; # Letters ([:L:]) that are also in [:Kana:]; excluded from $Letter via $LineBreak
10
11 #
12 # Definition of $Ideographic is from TR14, Line Breaking.
13 #
14 $Ideographic =
15 [ \u1100-\u1159 \u115F \u2E80-\u2E99 \u2E9B-\u2EF3 \u2F00-\u2FD5 \u2FF0-\u2FFB
16 \u3000 \u3003-\u3004 \u3006-\u3007 \u3012-\u3013 \u3020-\u3029
17 \u3030-\u303A \u303D-\u303F \u3042 \u3044 \u3046 \u3048 \u304A-\u3062
18 \u3064-\u3082 \u3084 \u3086 \u3088-\u308D \u308F-\u3094 \u309F
19 \u30A2 \u30A4 \u30A6 \u30A8 \u30AA-\u30C2 \u30C4-\u30E2 \u30E4
20 \u30E6 \u30E8-\u30ED \u30EF-\u30F4 \u30F7-\u30FA \u30FC \u30FE-\u30FF
21 \u3105-\u312C \u3131-\u318E \u3190-\u31B7 \u3200-\u321C \u3220-\u3243
22 \u3251-\u327B \u327F-\u32CB \u32D0-\u32FE \u3300-\u3376 \u337B-\u33DD
23 \u33E0-\u33FE \u3400-\u4DB5 \u4E00-\u9FA5 \uA000-\uA48C \uA490-\uA4C6
24 \uAC00-\uD7A3 \uF900-\uFA2D \uFA30-\uFA6A \uFE30-\uFE34 \uFE45-\uFE46
25 \uFE49-\uFE4F \uFE51 \uFE58 \uFE5F-\uFE66 \uFE68 \uFE6B \uFF02-\uFF03
26 \uFF06-\uFF07 \uFF0A-\uFF0B \uFF0D \uFF0F-\uFF19 \uFF1C-\uFF1E
27 \uFF20-\uFF3A \uFF3C \uFF3E-\uFF5A \uFF5C \uFF5E \uFFE2-\uFFE4
28 \U00020000-\U0002A6D6 \U0002F800-\U0002FA1D];
29
30 #
31 # These definitions are from the character break rules.
32 #
33 $CGJ = [\u034f]; #Combining Grapheme Joiner
34 $Link = [\u094D \u09CD \u0A4D \u0ACD \u0B4D \u0BCD \u0C4D \u0CCD \u0D4D \u0DCA \u0E3A \u1039 \u17D2]; # Linking marks: $LinkSequence lets one or more of these attach a following letter
35 $NotControl = [^[:Zl:] [:Zp:] [:Cc:]]; # Negated set: everything except Line Separator (Zl), Paragraph Separator (Zp) and General Category == Control (Cc)
36
37 $Extend = # From UNIDATA/DerivedCoreProperties.txt
38 [\u0300-\u034E \u0360-\u036F \u0483-\u0486 \u0488-\u0489 \u0591-\u05A1 \u05A3-\u05B9
39 \u05BB-\u05BD \u05BF \u05C1-\u05C2 \u05C4 \u064B-\u0655 \u0670 \u06D6-\u06DC
40 \u06DE \u06DF-\u06E4 \u06E7-\u06E8 \u06EA-\u06ED \u0711 \u0730-\u074A
41 \u07A6-\u07B0 \u0901-\u0902 \u0903 \u093C \u093E-\u0940 \u0941-\u0948
42 \u0949-\u094C \u0951-\u0954 \u0962-\u0963 \u0981 \u0982-\u0983 \u09BC
43 \u09BE-\u09C0 \u09C1-\u09C4 \u09C7-\u09C8 \u09CB-\u09CC \u09D7 \u09E2-\u09E3
44 \u0A02 \u0A3C \u0A3E-\u0A40 \u0A41-\u0A42 \u0A47-\u0A48 \u0A4B-\u0A4C
45 \u0A70-\u0A71 \u0A81-\u0A82 \u0A83 \u0ABC \u0ABE-\u0AC0 \u0AC1-\u0AC5
46 \u0AC7-\u0AC8 \u0AC9 \u0ACB-\u0ACC \u0B01 \u0B02-\u0B03 \u0B3C \u0B3E
47 \u0B3F \u0B40 \u0B41-\u0B43 \u0B47-\u0B48 \u0B4B-\u0B4C \u0B56 \u0B57
48 \u0B82 \u0BBE-\u0BBF \u0BC0 \u0BC1-\u0BC2 \u0BC6-\u0BC8 \u0BCA-\u0BCC \u0BD7
49 \u0C01-\u0C03 \u0C3E-\u0C40 \u0C41-\u0C44 \u0C46-\u0C48 \u0C4A-\u0C4C
50 \u0C55-\u0C56 \u0C82-\u0C83 \u0CBE \u0CBF \u0CC0-\u0CC4 \u0CC6
51 \u0CC7-\u0CC8 \u0CCA-\u0CCB \u0CCC \u0CD5-\u0CD6 \u0D02-\u0D03 \u0D3E-\u0D40
52 \u0D41-\u0D43 \u0D46-\u0D48 \u0D4A-\u0D4C \u0D57 \u0D82-\u0D83 \u0DCF-\u0DD1
53 \u0DD2-\u0DD4 \u0DD6 \u0DD8-\u0DDF \u0DF2-\u0DF3 \u0E31 \u0E34-\u0E39
54 \u0E47-\u0E4E \u0EB1 \u0EB4-\u0EB9 \u0EBB-\u0EBC \u0EC8-\u0ECD \u0F18-\u0F19
55 \u0F35 \u0F37 \u0F39 \u0F3E-\u0F3F \u0F71-\u0F7E \u0F7F \u0F80-\u0F84
56 \u0F86-\u0F87 \u0F90-\u0F97 \u0F99-\u0FBC \u0FC6 \u102C \u102D-\u1030 \u1031
57 \u1032 \u1036-\u1037 \u1038 \u1056-\u1057 \u1058-\u1059 \u1712-\u1714
58 \u1732-\u1734 \u1752-\u1753 \u1772-\u1773 \u17B4-\u17B6 \u17B7-\u17BD
59 \u17BE-\u17C5 \u17C6 \u17C7-\u17C8 \u17C9-\u17D1 \u17D3 \u180B-\u180D
60 \u18A9 \u20D0-\u20DC \u20DD-\u20E0 \u20E1 \u20E2-\u20E4 \u20E5-\u20EA
61 \u302A-\u302F \u3099-\u309A \uFB1E \uFE00-\uFE0F \uFE20-\uFE23 \uFF9E-\uFF9F
62 \U0001D165-\U0001D166 \U0001D167-\U0001D169 \U0001D16D-\U0001D172
63 \U0001D17B-\U0001D182 \U0001D185-\U0001D18B \U0001D1AA-\U0001D1AD];
64
65 #
66 # Korean, also taken from character break rules.
67 #
68 #
69 # Korean Syllable Sequences
70 #
71 $L = [\u1100-\u115f]; # presumably the Hangul leading-consonant jamo (L) -- confirm range against the Unicode Hangul Jamo block
72 $V = [\u1160-\u11a2]; # presumably the Hangul vowel jamo (V) -- confirm range
73 $T = [\u11a8-\u11f9]; # presumably the Hangul trailing-consonant jamo (T) -- confirm range
74 $LV = [ \uac00 \uac1c \uac38 \uac54 \uac70 \uac8c \uaca8 \uacc4 \uace0 \uacfc \uad18 \uad34 \uad50 \uad6c \uad88 \uada4
75 \uadc0 \uaddc \uadf8 \uae14 \uae30 \uae4c \uae68 \uae84 \uaea0 \uaebc \uaed8 \uaef4 \uaf10 \uaf2c \uaf48 \uaf64
76 \uaf80 \uaf9c \uafb8 \uafd4 \uaff0 \ub00c \ub028 \ub044 \ub060 \ub07c \ub098 \ub0b4 \ub0d0 \ub0ec \ub108 \ub124
77 \ub140 \ub15c \ub178 \ub194 \ub1b0 \ub1cc \ub1e8 \ub204 \ub220 \ub23c \ub258 \ub274 \ub290 \ub2ac \ub2c8 \ub2e4
78 \ub300 \ub31c \ub338 \ub354 \ub370 \ub38c \ub3a8 \ub3c4 \ub3e0 \ub3fc \ub418 \ub434 \ub450 \ub46c \ub488 \ub4a4
79 \ub4c0 \ub4dc \ub4f8 \ub514 \ub530 \ub54c \ub568 \ub584 \ub5a0 \ub5bc \ub5d8 \ub5f4 \ub610 \ub62c \ub648 \ub664
80 \ub680 \ub69c \ub6b8 \ub6d4 \ub6f0 \ub70c \ub728 \ub744 \ub760 \ub77c \ub798 \ub7b4 \ub7d0 \ub7ec \ub808 \ub824
81 \ub840 \ub85c \ub878 \ub894 \ub8b0 \ub8cc \ub8e8 \ub904 \ub920 \ub93c \ub958 \ub974 \ub990 \ub9ac \ub9c8 \ub9e4
82 \uba00 \uba1c \uba38 \uba54 \uba70 \uba8c \ubaa8 \ubac4 \ubae0 \ubafc \ubb18 \ubb34 \ubb50 \ubb6c \ubb88 \ubba4
83 \ubbc0 \ubbdc \ubbf8 \ubc14 \ubc30 \ubc4c \ubc68 \ubc84 \ubca0 \ubcbc \ubcd8 \ubcf4 \ubd10 \ubd2c \ubd48 \ubd64
84 \ubd80 \ubd9c \ubdb8 \ubdd4 \ubdf0 \ube0c \ube28 \ube44 \ube60 \ube7c \ube98 \ubeb4 \ubed0 \ubeec \ubf08 \ubf24
85 \ubf40 \ubf5c \ubf78 \ubf94 \ubfb0 \ubfcc \ubfe8 \uc004 \uc020 \uc03c \uc058 \uc074 \uc090 \uc0ac \uc0c8 \uc0e4
86 \uc100 \uc11c \uc138 \uc154 \uc170 \uc18c \uc1a8 \uc1c4 \uc1e0 \uc1fc \uc218 \uc234 \uc250 \uc26c \uc288 \uc2a4
87 \uc2c0 \uc2dc \uc2f8 \uc314 \uc330 \uc34c \uc368 \uc384 \uc3a0 \uc3bc \uc3d8 \uc3f4 \uc410 \uc42c \uc448 \uc464
88 \uc480 \uc49c \uc4b8 \uc4d4 \uc4f0 \uc50c \uc528 \uc544 \uc560 \uc57c \uc598 \uc5b4 \uc5d0 \uc5ec \uc608 \uc624
89 \uc640 \uc65c \uc678 \uc694 \uc6b0 \uc6cc \uc6e8 \uc704 \uc720 \uc73c \uc758 \uc774 \uc790 \uc7ac \uc7c8 \uc7e4
90 \uc800 \uc81c \uc838 \uc854 \uc870 \uc88c \uc8a8 \uc8c4 \uc8e0 \uc8fc \uc918 \uc934 \uc950 \uc96c \uc988 \uc9a4
91 \uc9c0 \uc9dc \uc9f8 \uca14 \uca30 \uca4c \uca68 \uca84 \ucaa0 \ucabc \ucad8 \ucaf4 \ucb10 \ucb2c \ucb48 \ucb64
92 \ucb80 \ucb9c \ucbb8 \ucbd4 \ucbf0 \ucc0c \ucc28 \ucc44 \ucc60 \ucc7c \ucc98 \uccb4 \uccd0 \uccec \ucd08 \ucd24
93 \ucd40 \ucd5c \ucd78 \ucd94 \ucdb0 \ucdcc \ucde8 \uce04 \uce20 \uce3c \uce58 \uce74 \uce90 \uceac \ucec8 \ucee4
94 \ucf00 \ucf1c \ucf38 \ucf54 \ucf70 \ucf8c \ucfa8 \ucfc4 \ucfe0 \ucffc \ud018 \ud034 \ud050 \ud06c \ud088 \ud0a4
95 \ud0c0 \ud0dc \ud0f8 \ud114 \ud130 \ud14c \ud168 \ud184 \ud1a0 \ud1bc \ud1d8 \ud1f4 \ud210 \ud22c \ud248 \ud264
96 \ud280 \ud29c \ud2b8 \ud2d4 \ud2f0 \ud30c \ud328 \ud344 \ud360 \ud37c \ud398 \ud3b4 \ud3d0 \ud3ec \ud408 \ud424
97 \ud440 \ud45c \ud478 \ud494 \ud4b0 \ud4cc \ud4e8 \ud504 \ud520 \ud53c \ud558 \ud574 \ud590 \ud5ac \ud5c8 \ud5e4
98 \ud600 \ud61c \ud638 \ud654 \ud670 \ud68c \ud6a8 \ud6c4 \ud6e0 \ud6fc \ud718 \ud734 \ud750 \ud76c \ud788 ];
99 $LVT = [[\uac00-\ud7a3] - $LV]; # Every code point in U+AC00..U+D7A3 that is not listed in $LV
100 $Hangul_Sequence = ((($L+ $LV?) | ($L* $LV)) $V* $T* ) | ($L* $LVT $T*); # Two shapes: ($L+ with optional $LV, or $L* then $LV) followed by $V* $T*; or $L* then $LVT followed by $T*
101
102
103 #
104 # Thai Dictionary Related Rules
105 #
106 $dictionary = [\u0e01-\u0e2e \u0e30-\u0e3a \u0e40-\u0e44 \u0e47-\u0e4e]; # this rule breaks the iterator with mixed Thai and English
107 $paiyannoi = [\u0e2f]; # U+0E2F; not included in $dictionary, handled explicitly by the rules below
108 $maiyamok = [\u0e46]; # U+0E46; not included in $dictionary, handled explicitly by the rules below
109 $thai_etc = $paiyannoi \u0e25 $paiyannoi; # The three-character sequence U+0E2F U+0E25 U+0E2F
110
111
112 $dictionary+ ($paiyannoi? $maiyamok)?; # A run of $dictionary chars, optionally followed by $maiyamok (itself optionally preceded by $paiyannoi)
113 $dictionary+ $paiyannoi / ([^\u0e25 $maiyamok $Extend] | \u0e25[^$paiyannoi $Extend]); # $dictionary run plus $paiyannoi; the part after '/' is look-ahead context: next char is not U+0E25, $maiyamok or $Extend, or is U+0E25 not followed by $paiyannoi or $Extend
114 $thai_etc; # The $paiyannoi U+0E25 $paiyannoi sequence is matched as a unit
115
116
117 #
118 # Definitions for building up Letters, so that breaks will not occur
119 # within a single letter (Grapheme Cluster). See the character break rules.
120 #
121 $LineBreak = [$Ideographic $Hiragana $Katakana]; # Union of the three sets; subtracted from $Letter on the next line
122 $Letter = [[[:L:] [:Sk:]] & [^$LineBreak $dictionary]]; # [:L:] or [:Sk:], minus anything in $LineBreak or $dictionary
123 #$MidLetter = [\u0027 \u2019 \u0029 \u00ad \u05f3 \u05f4];
124 $MidLetter = [\u0027 \u2019 \u003a \u0029 \u00ad \u05f3 \u05f4]; # Allowed between two $LetterEx in the Big Rule; differs from the commented-out set by adding \u003a (':'). NOTE(review): \u0029 is ')' -- confirm it is intended here
125
126 $Base = [^[:Cc:] [:Cf:] [:Cs:] [:Co:] [:Cn:] [:Zl:] [:Zp:] $Extend $Link $CGJ]; # Negated set: anything not in the listed categories and not in $Extend, $Link or $CGJ
127 $LetterBase = [:L:]; # Any letter; the element a $LinkSequence must end with
128 $CGJSequence = $CGJ+ ($Base | $Hangul_Sequence); # One or more $CGJ followed by a $Base char or a Hangul sequence
129 $Join_Control = [\u200c-\u200d]; # Zero Width Non-Joiner (U+200C), Zero Width Joiner (U+200D); U+200E (LEFT-TO-RIGHT MARK) is not a join control
130 $LinkSequence = $Link+ $Extend* $Join_Control? $LetterBase; # One or more $Link, any $Extend chars, at most one $Join_Control, then a letter
131 $LetterEx = ($Letter | $Hangul_Sequence) $Extend* ((($LinkSequence | $CGJSequence) $Extend*)*); # A $Letter or Hangul sequence with trailing $Extend chars, then any number of link/CGJ sequences, each with its own $Extend*
132
133
134
135 #
136 # Numeric Definitions
137 # TODO: More complete handling of $Extend combining chars.
138 #
139 $Numeric = [:Nd:]; # General category Nd. TODO remove FULL WIDTH
140 $NumericEx = $Numeric $Extend*; # One $Numeric char plus any trailing $Extend chars
141 $InfixNumeric = [\u002c \u002e \u003a \u003b \u0589]; # ',' '.' ':' ';' and U+0589; may sit between two $NumericEx
142 $PostfixNumeric = [\% \u00a2 \u00b0 \u2030 \u2031 \u2032-\u2037 \u20a7
143 \u2103 \u2109 \u2126 \ufe6a \uff05 \uffe0]; # May follow a $NumericInterior at the end of the Big Rule
144 $PrefixNumeric = [[[:Sc:] \u002b \u005c \u00b1 \u2116 \u2212 \u2213 \-] - [$PostfixNumeric]]; # [:Sc:] plus the listed signs, minus anything that is already in $PostfixNumeric
145
146 $NumericPrefix = $PrefixNumeric $NumericEx ($InfixNumeric $NumericEx)?; # A prefix char, one $NumericEx, and at most one infix + $NumericEx pair
147 $NumericInterior = $NumericEx ($InfixNumeric? $NumericEx)*; # A sequence of $NumericEx, each optionally separated by a single infix char
148
149
150 #
151 # The Big Rule. Gloms everything together: an optional $NumericPrefix, then any mix of letter runs
152 # ($LetterEx joined by single $MidLetter chars) and $NumericInterior, then an optional $NumericInterior + $PostfixNumeric.
153 $NumericPrefix? (($LetterEx ($MidLetter $LetterEx)*)? $NumericInterior?)* ($NumericInterior $PostfixNumeric)?;
154
155 #
156 # Lesser rules
157 #
158 ($Hiragana $Extend*)*; # A run of $Hiragana chars, each with its trailing $Extend chars
159 ($Katakana $Extend*)*; # A run of $Katakana chars, each with its trailing $Extend chars
160 $NotControl $Extend*; # Any $NotControl char together with its trailing $Extend chars
161 \r\n; # CR followed by LF is matched as a unit
162 .; # Fallback: any single character
163
164 #
165 # Reverse Rules. Back up over any of the chars that can group together.
166 # (Reverse rules do not need to be exact; they can back up a bit too far,
167 # but must back up at least enough.)
168 #
169 ! ( $Letter | $MidLetter | $Numeric | $PrefixNumeric | $Join_Control |
170 $CGJ | $Link | $InfixNumeric | $PostfixNumeric | $Extend |
171 $T | $V | $L | $LV | $LVT)*; # Backs up over any run of these classes (the pieces used by the Big Rule and $Hangul_Sequence)
172 ! ($Hiragana | $Extend)*; # Backs up over a run of $Hiragana and $Extend chars
173 ! ($Katakana | $Extend)*; # Backs up over a run of $Katakana and $Extend chars
174 ! $Extend* .; # Backs up over trailing $Extend chars and then one more character
175 ! \n\r; # The forward \r\n pair as it appears when scanning backwards
176 #!.*;
177
178 ! ($dictionary | $paiyannoi | $maiyamok | \u0e25)*; # Reverse rule for the Thai rules: backs up over any run of these characters