]> git.saurik.com Git - apple/icu.git/blame - icuSources/test/testdata/rbbitst.txt
ICU-8.11.4.tar.gz
[apple/icu.git] / icuSources / test / testdata / rbbitst.txt
CommitLineData
73c04bcf 1# Copyright (c) 2001-2006 International Business Machines
b75a7d8f
A
2# Corporation and others. All Rights Reserved.
3#
4# RBBI Test Data
5#
6# File: rbbitst.txt
7#
8# The format of this file looks vaguely like some kind of xml-ish markup,
9# but it is NOT. The syntax is this..
10#
11# <word> any following data is for word break testing
12# <sent> any following data is for sentence break testing
13# <line> any following data is for line break testing
14# <char> any following data is for char break testing
73c04bcf 15# <locale local_name> Switch to the named locale at the next occurrence of <word>, <sent>, etc.
b75a7d8f
A
16# <data> ... </data> test data. May span multiple lines.
17# <> Break position, status == 0
18# • Break position, status == 0 (Bullet, \u2022)
19# <nnn> Break position, status == nnn
20# \ Escape. Normal ICU unescape applied.
21# \ at end of line -> Line Continuation. Remove both the backslash and the new line
22#
23#
24
25
73c04bcf 26# Temp debugging tests
374ca955
A
27<line>
28# to test for bug #4097920
29<data>•dog,cat,mouse •(one)•(two)\n<100></data>
b75a7d8f
A
30
31########################################################################################
32#
33#
34# G r a p h e m e C l u s t e r T e s t s
35#
36#
37##########################################################################################
38<char>
39
40<data>•a•b•c• •,•\u0666•</data> # Quick Test
41<data>•\r•\r•\r\n•\r\n•\n•\r•</data> # don't break CR/LF
42
43# Always break after controls. Combining chars don't combine with them.
44<data>•\u0003•\N{COMBINING GRAVE ACCENT}•\r•\N{COMBINING GRAVE ACCENT}•</data>
45<data>•\u0085•\N{COMBINING MACRON}•A\N{COMBINING MACRON}•</data>
46
47# Surrogates
48<data>•\U00011000•\U00010020•\U00010000\N{COMBINING MACRON}•</data>
49<data>•\ud800\udc00•\udbff\udfff•a•</data>
50
51# Extend (Combining chars) combine.
52<data>•A\N{COMBINING GRAVE ACCENT}•B•</data>
53<data>•\N{GREEK SMALL LETTER MU}\N{COMBINING LOW LINE}\N{COMBINING HORN}•</data>
54<data>•a\u0301•b\u0302•c\u0303•d\u0304•e\u0305•f\u0306•g\u0307•h\u0308•i\u0309•</data>
55
56<data>•a\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304•</data>
57
58# Don't break Hangul Syllables
59# L : \u1100
60# V : \u1161
61# T : \u11A8
62# LV : \uAC00
63# LVT : \uAC01
64
65<data>•\u1100\u1161\u11a8•\u1100\u1161\u11a8•</data> #LVT
66<data>•\u1100\u1161•\u1100\u1161•</data>
67<data>•\u1100\u1161\u11a8•\u1161•\u1100•\u11a8•\u1161\u1161\u1161\u11a8•</data>
68<data>•\u1100\u1100\uac01•\u1100\uac01•\u1100\uac01\u0301•\uac01•</data>
69<data>•\u1100\u0301•\u1161\u11a8\u0301•\u11a8•</data>
70
71
72
73# Hindi combining chars. (An old test)
74<data>•भ••ा•\u0930•\u0924• •\u0938\u0941\u0902•\u0926•\u0930•
75•\u0939•\u094c•\u0964•</data>
76<data>•\u0916\u0947•\u0938\u0941\u0902•\u0926•\u0930•\u0939•\u094c•\u0964•</data>
77
78
79# Bug 1587. Tamil. \u0baa\u0bc1 should be two separate characters, even though
80# Hyangmi would prefer that it be one.
81<data>•\u0baa•\u0bc1•\u0baa•\u0bc1•</data>
82
83# Regression test for bug 1889
84<data>•\u0f40\u0f7d•\u0000•\u0f7e•</data>
85
86
87# 0xffff is a legal character, and should not stop the break iterator early.
88# (Requires special casing in implementation, which is why it gets a test.)
89<data>•\uffff•\uffff• •a•</data>
90
73c04bcf
A
91# Treat Japanese Half Width voicing marks as combining
92<data>•A\uff9e•B\uff9f\uff9e\uff9f•C•</data>
b75a7d8f
A
93
94########################################################################################
95#
96#
97# W o r d B o u n d a r y T e s t s
98#
99#
100##########################################################################################
101
102<word>
103#
104# Quick sanity test
105#
106<data>•hello<200> •there<200> •goodbye<200></data>
107<data>•hello<200> •12345<100> •,•</data>
108
109
110#
111# Test data originally in RBBIAPITest::TestFirstNextFollowing() and TestLastPreviousPreceding()
112#
113
114<word>
115<data>•This<200> •is<200> •a<200> •word<200> •break<200>.• • •Isn't<200> •it<200>?• •2.25<100></data>
116
117
118
119#
120# Data originally from TestDefaultRuleBasedWordIteration()
121#
122<data>•Write<200> •wordrules<200>.• •123.456<100> •alpha\u00adbeta\u00adgamma<200> •\u092f\u0939<200> •</data>
123<data>• •\u0939\u093f\u0928\u094d\u200d\u0926\u0940<200> •\u0939\u0948<200> •\u0905\u093e\u092a<200> •\u0938\u093f\u0916\u094b\u0917\u0947<200>?•</data>
124
125#Hindi Numbers
126<data>• •\u0968\u0966.\u0969\u096f<100> •\u0967\u0966\u0966.\u0966\u0966<100> •\N{RUPEE SIGN}•\u0967,\u0967\u0966\u0966.\u0966\u0966<100> • •\u0905\u092e\u091c<200>\n•</data>
127
128<data>•\u0938\u094d\u200d\u0935\u0924\u0902deadTA\u0930<200>\r•It's<200> •$•30.10<100> •12,34<100>¢•£•¤•¥•alpha\u05f3beta\u05f4gamma<200> •</data>
129
130<data>•Badges<200>?• •BADGES<200>!•?•!• •We<200> •don't<200> •need<200> •no<200> •STINKING<200> •BADGES<200>!•!•1000,233,456.000<100> •1,23.322<100>%•123.1222<100>$•123,000.20<100> •179.01<100>%•X<200> •Now<200>\r•is<200>\n•the<200>\r\n•time<200> •</data>
131
132#Hangul
133<data>•\uc5f0\ud569<200> •\uc7a5\ub85c\uad50\ud68c<200> •\u1109\u1161\u11bc\u1112\u1161\u11bc<200> •\u1112\u1161\u11ab\u110b\u1175\u11ab<200> •Hello<200>,• •how<200> •are<200> •you<200> •</data>
134
135
136# Words containing non-BMP letters
137<data>•abc\U00010300<200> •abc\N{DESERET SMALL LETTER ENG}<200> •abc\N{MATHEMATICAL BOLD SMALL Z}<200> •abc\N{MATHEMATICAL SANS-SERIF BOLD ITALIC PI SYMBOL}<200> •</data>
138
139# Unassigned code points
140<data>•abc<200>\U0001D800•def<200>\U0001D3FF• •</data>
141
142# Hiragana & Katakana stay together, but separates from each other and Latin.
143<data>•abc<200>\N{HIRAGANA LETTER SMALL A}<300>\N{HIRAGANA LETTER VU}\N{COMBINING ACUTE ACCENT}<300>\N{HIRAGANA ITERATION MARK}<300>\N{KATAKANA LETTER SMALL A}\N{KATAKANA ITERATION MARK}\N{HALFWIDTH KATAKANA LETTER WO}\N{HALFWIDTH KATAKANA LETTER N}<300>def<200>#•</data>
144
145# Words with interior formatting characters
146<data>•def\N{COMBINING ACUTE ACCENT}\N{SYRIAC ABBREVIATION MARK}ghi<200> •</data>
147
148# to test for bug #4097779
149<data>•aa\N{COMBINING GRAVE ACCENT}a<200> •</data>
150
151
152# to test for bug #4098467
153# What follows is a string of Korean characters (I found it in the Yellow Pages
154# ad for the Korean Presbyterian Church of San Francisco, and I hope I transcribed
155# it correctly), first as precomposed syllables, and then as conjoining jamo.
156# Both sequences should be semantically identical and break the same way.
157# precomposed syllables...
158<data>•\uc0c1\ud56d<200> •\ud55c\uc778<200> •\uc5f0\ud569<200> •\uc7a5\ub85c\uad50\ud68c<200> •\u1109\u1161\u11bc\u1112\u1161\u11bc<200> •\u1112\u1161\u11ab\u110b\u1175\u11ab<200> •\u110b\u1167\u11ab\u1112\u1161\u11b8<200> •\u110c\u1161\u11bc\u1105\u1169\u1100\u116d\u1112\u116c<200> •</data>
159
160<data>•abc<200>\u4e01<400>\u4e02<400>\u3005<200>\u4e03<400>\u4e03<400>abc<200> •</data>
161
73c04bcf 162<data>•\u06c9\uc799\ufffa<200></data>
b75a7d8f
A
163
164#
165# Try some words from other scripts.
166#
167
168# Try some words from other scripts.
169# Greek, Cyrillic, Hebrew, Arabic, Arabic-Indic digits, Georgian, Latin
170#
171<data>•ΑΒΓ<200> •БВГ<200> •אבג֓<200> •ابت<200> •١٢٣<100> •\u10A0\u10A1\u10A2<200> •ABC<200> •</data>
172
173<data>•\u0301•A<200></data>
174
175
176#
177# Hindi word break tests, imported from the old RBBI tests.
178# An historical note: a much earlier version of ICU break iterators had a number
179# of special case rules for Hindi, which were tested by an earlier version of
180# this test data. The current RBBI rules do not special case Hindi in
181# any way, making this test data much less significant.
182#
183<data>•\u0917\u092a\u00ad\u0936\u092a<200>!•\u092f\u0939<200> •\u0939\u093f\u0928\u094d\u200d\u0926\u0940<200> •\u0939\u0948<200> •\u0905\u093e\u092a<200> •\u0938\u093f\u0916\u094b\u0917\u0947<200>?•\n•:•\u092a\u094d\u0930\u093e\u092f\u0903<200>
184•\u0935\u0930\u094d\u0937\u093e<200>\r\n•\u092a\u094d\u0930\u0915\u093e\u0936<200>,•\u0924\u0941\u092e\u093e\u0930\u094b<200> •\u092e\u093f\u0924\u094d\u0930<200> •\u0915\u093e<200> •\u092a\u0924\u094d\u0930<200> •\u092a\u095d\u094b<200> •\u0938\u094d\u0924\u094d\u0930\u093f<200>.• •\u0968\u0966.\u0969\u096f<100> •\u0967\u0966\u0966.\u0966\u0966<100>\u20a8•\u0967,\u0967\u0966\u0966.\u0966\u0966<100> •\u0905\u092e\u091c<200>\n•\u0938\u094d\u200d\u0935\u0924\u0902\u0924\u094d\u0930<200>\r•</data>
185
374ca955
A
186#
187# Failures from monkey tests
188#
189<data>•\u8527<400>\u02ba<200>\u0027\u0d42•\u00b7•\u09ea<100></data>
190
73c04bcf
A
191#
192# Jitterbug 5276 - treat Japanese half width voicing marks as Grapheme Extend
193#
194<data>•A\uff9e\uff9fBC<200> •1\uff9e\uff9f23<100></data>
195
b75a7d8f
A
196########################################################################################
197#
198#
199# S e n t e n c e B o u n d a r y T e s t s
200#
201#
202##########################################################################################
203
204
205#
206# Test data originally from RBBI RBBITest::TestDefaultRuleBasedSentenceIteration()
207#
208<sent>
209
210
211<sent>
374ca955 212<data>•This\n<100></data>
b75a7d8f 213<data>•Hello! •how are you? •I'am fine. •Thankyou. •How are you \
374ca955 214doing? •This\n<100> costs $20,00,000. •</data>
b75a7d8f
A
215
216
217# Sentence ending in a quote.
218<data>•"Sentence ending with a quote." •Bye.•</data>
219
220# Sentence, and test data, ending without a period or other terminator.
374ca955 221<data>•Here is a random sentence, no ending period<100></data>
b75a7d8f
A
222
223
224<data>• (This is it). •Testing the sentence iterator. •\
225"This isn't it." •Hi! \
226•This is a simple sample sentence. •(This is it.) •This is a simple sample sentence. •\
227"This isn't it." •\
73c04bcf
A
228Hi! •This is a simple sample sentence. •It does not have to make any sense as you can see. •Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. •Che la dritta via aveo smarrita. •He said, that I said, that you said!! •Don't rock the boat.\u2029•Because I am the daddy, that is why.
229•Not on my time (el timo.)! •</data>
b75a7d8f 230
73c04bcf 231<data>•Hello. •So what!!\u2029•"But now," he said, \
b75a7d8f 232"I know!" •\
73c04bcf 233Harris thumbed down several, including "Away We Go" (which became the huge success Oklahoma!). •One species, B. anthracis, is highly virulent.
b75a7d8f
A
234•Wolf said about Sounder:\
235"Beautifully thought-out and directed." •\
236Have you ever said, "This is where\tI shall live"? •He answered, \
73c04bcf 237"You may not!" •Another popular saying is: "How do you do?". \n•\
b75a7d8f
A
238Yet another popular saying is: \
239'I'm fine thanks.' •\
73c04bcf 240What is the proper use of the abbreviation pp.? •Yes, I am definatelly 12" tall!!\
374ca955 241•Now\r<100>is\n<100>the\r\n<100>time\n<100>\r<100>for\r<100>\r<100></data>
b75a7d8f
A
242
243<data>•No breaks when . is surrounded by UPPER.Case letters. •</data>
244<data>•No breaks when . is followed by Numeric .4 a.4 C.4 3.1 .•</data>
245<data>•No breaks when . is followed by a lower, with possible intervening punct .,a .$a .)a. •</data>
246
247#
248# Sentence Breaks: no break at the boundary between CJK and other letters
249#
250<data>•\u5487\u67ff\ue591\u5017\u61b3\u60a1\u9510\u8165:"JAVA\u821c\u8165\u7fc8\u51ce\u306d,\u2494\u56d8\u4ec0\u60b1\u8560\u51ba\u611d\u57b6\u2510\u5d46".\u2029•\u5487\u67ff\ue591\u5017\u61b3\u60a1\u9510\u8165\u9de8\u97e4JAVA\u821c\u8165\u7fc8\u51ce\u306d\ue30b\u2494\u56d8\u4ec0\u60b1\u8560\u51ba\u611d\u57b6\u2510\u5d46\u97e5\u7751\u3002•\u5487\u67ff\ue591\u5017\u61b3\u60a1\u9510\u8165\u9de8\u97e4\u6470\u8790JAVA\u821c\u8165\u7fc8\u51ce\u306d\ue30b\u2494\u56d8\u4ec0\u60b1\u8560\u51ba\u611d\u57b6\u2510\u5d46\u97e5\u7751\u2048•He said, "I can go there."\u2029•Bye, now.•</data>
251
252#
253# Treat fullwidth variants of .!? the same as their
254# normal counterparts
255#
73c04bcf 256<data>•I know I'm right\uff0e •Right\uff1f •Right\uff01 •</data>
b75a7d8f
A
257
258
259#
260# Don't break sentences at boundary between CJK and digits
261#
73c04bcf 262<data>•\u5487\u67ff\ue591\u5017\u61b3\u60a1\u9510\u8165\u9de8\u97e48888\u821c\u8165\u7fc8\u51ce\u306d\ue30b\u2494\u56d8\u4ec0\u60b1\u8560\u51ba\u611d\u57b6\u2510\u5d46\u97e5\u7751\u3002•Bye, now<100></data>
b75a7d8f
A
263
264#
265# Breaks around '(' following a sentence TERM. (Rule 9)
266#
267<data>•How do you do?(•Fine). •</data>
268<data>•How do you do? •(Fine). •</data>
269<data>•How do you do?(•fine). •</data>
270<data>•How do you do? •(fine). •</data>
271
272#
374ca955
A
273<data>•Hello.123<100></data> # Rule 6
274<data>•Hello?•123<100></data>
b75a7d8f 275
374ca955
A
276<data>•HELLO.Bye<100></data> # Rule 7
277<data>•HELLO?•Bye<100></data>
b75a7d8f 278
374ca955
A
279<data>•Hello.goodbye<100></data> #Rule 8
280<data>•Hello. •Goodbye<100></data>
281<data>•Hello. goodbye<100></data>
b75a7d8f
A
282
283
284
285#
286# test for bug #4158381: No breaks when there are no terminators around
287#
288<data>•\<P>Provides a set of &quot;lightweight&quot; (all-java\<FONT SIZE="-2">\<SUP>TM\</SUP>\</FONT> language) components that, to the maximum degree possible, work the same on all platforms. •</data>
289<data>•Another test.\u2029•</data>
290
291# test for bug #4143071: Make sure sentences that end with digits
292# work right
293#
294<data>•Today is the 27th of May, 1998. •</data>
295<data>•Tomorrow with be 28 May 1998. •</data>
296<data>•The day after will be the 30th.\u2029•</data>
297
298# test for bug #4152416: Make sure sentences ending with a capital
299# letter are treated correctly
300#
301<data>•The type of all primitive \<code>boolean\</code> values accessed in the target VM. •Calls to xxx will return an implementor of this interface. \u2029•</data>
302
303# test for bug #4152117: Make sure sentence breaking is handling
304# punctuation correctly [COULD NOT REPRODUCE THIS BUG, BUT TEST IS
305# HERE TO MAKE SURE IT DOESN'T CROP UP]
306#
307<data>•Constructs a randomly generated BigInteger, uniformly distributed over the range \<tt>0\</tt> to \<tt>(2\<sup>numBits\</sup> - 1\)\</tt>, inclusive. •The uniformity of the distribution assumes that a fair source of random bits is provided in \<tt>rnd\</tt>. •Note that this constructor always constructs a non-negative biginteger. \n•Ahh abc.
308•</data>
309
310# sentence breaks for Hindi, which uses Devanagari script
311# make sure there is sentence break after ?,danda(hindi phrase separator),
312# fullstop followed by space. (VERY old test)
313#
374ca955 314<data>•\u0928\u092e\u0938\u094d\u200d\u0924\u0947 \u0930\u092e\u0947\u0936\u0905\u093e\u092a\u0915\u0948\u0938\u0947 \u0939\u0948?•\u092e\u0948 \u0905\u091a\u094d\u200d \u091b\u093e \u0939\u0942\u0901\u0964 •\u0905\u093e\u092a\r\n<100>\
b75a7d8f 315\u0915\u0948\u0938\u0947 \u0939\u0948?•\u0935\u0939 \u0915\u094d\u200d\u092f\u093e\n\
374ca955 316<100>\u0939\u0948?•\u092f\u0939 \u0905\u093e\u092e \u0939\u0948. •\u092f\u0939 means "this". •"\u092a\u095d\u093e\u0908" meaning "education" or "studies". •\u0905\u093e\u091c(\u0938\u094d\u200d\u0935\u0924\u0902\u0924\u094d\u0930 \u0926\u093f\u0935\u093e\u0938) \u0939\u0948\u0964 •Let's end here. •</data>
b75a7d8f
A
317
318# Regression test for bug #1984, Sentence break in Arabic text.
319
320<data>\
73c04bcf 321•\u0623\u0633\u0627\u0633\u064b\u0627\u060c\u0020\u062a\u062a\u0639\u0627"\u0645\u0644\u0020\u0627\u0644\u062d\u0648\u0627\u0633\u064a\u0628\u0020"\u0641\u0642\u0637\u0020\u0645\u0639\u0020\u0627\u0644\u0623\u0631\u0642\u0627\u0645\u060c\u0648\u062a\u0642\u0648\u0645\u0020\u0628\u062a\u062e\u0632\u064a\u0646\u0020\u0627\u0644\u0623\u062d\u0631\u0641\u0020\u0648\u0627\u0644\u0645\u062d\u0627\u0631\u0641\u0020\u0627\u0644\u0623\u062e\u0631\u0649\u0020\u0628\u0639\u062f\u0020\u0623\u0646\u062a\u064f\u0639\u0637\u064a\u0020\u0631\u0642\u0645\u0627\u0020\u0645\u0639\u064a\u0646\u0627\u0020\u0644\u0643\u0644\u0020\u0648\u0627\u062d\u062f\u0020\u0645\u0646\u0647\u0627\u002e\u0020•\u0648\u0642\u0628\u0644\u0020\u0627\u062e\u062a\u0631\u0627\u0639\u0022\u064a\u0648\u0646\u0650\u0643\u0648\u062f\u0022\u060c\u0020\u0643\u0627\u0646\u0020\u0647\u0646\u0627\u0643\u0020\u0645\u0626\u0627\u062a\u0020\u0627\u0644\u0623\u0646\u0638\u0645\u0629\u0020\u0644\u0644\u062a\u0634\u0641\u064a\u0631\u0648\u062a\u062e\u0635\u064a\u0635\u0020\u0647\u0630\u0647\u0020\u0627\u0644\u0623\u0631\u0642\u0627\u0645\u0020\u0644\u0644\u0645\u062d\u0627\u0631\u0641\u060c\u0020\u0648\u0644\u0645\u0020\u064a\u0648\u062c\u062f\u0020\u0646\u0638\u0627\u0645\u062a\u0634\u0641\u064a\u0020\u0639\u0644\u0649\u0020\u062c\u0645\u064a\u0639\u0020\u0627\u0644\u0645\u062d\u0627\u0631\u0641\u0020\u0627\u0644\u0636\u0631\u0648\u0631\u064a\u0629. •</data>
b75a7d8f
A
322
323# Try a few more of the less common sentence endings.
324<data>•Hello, world\u3002 •Hello, world\u1803 •Hello, world\u2048 •Hello, world\u203c •Let's end here. •</data>
325
326
327
328
329################################################################
330#
331#
332# L I N E B R E A K
333#
334#
335################################################################
336
337<line>
338#
339# Test Character for each of the line break classes.
340#
341# 00A1;AI # INVERTED EXCLAMATION MARK ¡
342# 0041;AL # LATIN CAPITAL LETTER A
343# 0009;BA # <control>
344# 00B4;BB # ACUTE ACCENT
345# 000C;BK # <control>
346# 2014;B2 # EM DASH
347# FFFC;CB # OBJECT REPLACEMENT CHARACTER
348# 0029;CL # RIGHT PARENTHESIS
349# 0301;CM # COMBINING ACUTE ACCENT
350# 0021;EX # EXCLAMATION MARK
351# 00A0;GL # NO-BREAK SPACE
352# 002D;HY # HYPHEN-MINUS
353# 4E00;ID # <CJK Ideograph, First>
354# 2024;IN # ONE DOT LEADER
355# 002C;IS # COMMA
356# 000A;LF # <control>
357# 0E5A;NS # THAI CHARACTER ANGKHANKHU
358# 0032;NU # DIGIT TWO
359# 0028;OP # LEFT PARENTHESIS
360# 0025;PO # PERCENT SIGN
361# 0024;PR # DOLLAR SIGN
362# 0022;QU # QUOTATION MARK
363# 0E01;SA # THAI CHARACTER KO KAI
364# DB7F;SG # Surrogate
365# 0020;SP # SPACE
366# 002F;SY # SOLIDUS /
367# F8FF;XX # Private Use
368# 200B;ZW # ZERO WIDTH SPACE
369
370
371# 2b Always break at end of text
372
373<data>• •\u00A1•</data>
374<data>• •\u0041•</data>
375<data>• •\u0009•</data>
376<data>• •\u00B4•</data>
374ca955 377<data>• \u000C<100></data> # LB3C × BK
b75a7d8f
A
378<data>• •\u2014•</data>
379<data>• •\uFFFC•</data>
380<data>• \u0029•</data> # LB 8 × CL
381# <data>• • \u0301•</data> # LB 7a Treat SP CM* as if it were ID #TODO: SP CM
382<data>• \u0021•</data> # LB 8 × EX
383#<data>• \u00A0•</data> # LB 11b × GL TODO: fix.
384<data>• •\u002D•</data>
385<data>• •\u4E00•</data>
386<data>• •\u2024•</data>
387<data>• \u002C•</data> # LB 8 × IS
374ca955 388<data>• \u000A<100></data> # LB3C × ( BK | CR | LF | NL )
b75a7d8f
A
389<data>• •\u0E5A•</data>
390<data>• •\u0032•</data>
391<data>• •\u0028•</data>
392<data>• •\u0025•</data>
393<data>• •\u0024•</data>
394<data>• •\u0022•</data>
395<data>• •\u0E01•</data>
396<data>• •\uDB7F•</data>
397<data>• \u0020•</data> # LB4 - don't break before space.
398<data>• \u002F•</data> # LB 8 × SY
399<data>• •\uF8FF•</data>
400<data>• \u200B•</data> # LB4 - don't break before ZW
401
402
403# 3a Always break after hard line breaks.
404# 3c Never break before hard line breaks.
405
374ca955
A
406<data>• •\u00A1\u2028<100>\u00A1•</data>
407<data>• •\u0041\u2028<100>\u0041•</data>
408<data>• •\u0009\u2028<100>\u0009•</data>
409<data>• •\u00B4\u2028<100>\u00B4•</data>
410<data>• \u000C<100>\u2028<100>\u000C<100></data>
411<data>• •\u2014\u2028<100>\u2014•</data>
412<data>• •\uFFFC\u2028<100>\uFFFC•</data>
413<data>• \u0029\u2028<100>\u0029•</data>
414#<data>• \u0301\u2028<100>\u0301•</data> # TODO: fix.
415<data>• \u0021\u2028<100>\u0021•</data>
416#<data>• \u00A0\u2028<100>\u00A0•</data> # TODO: fix
417<data>• •\u002D\u2028<100>\u002D•</data>
418<data>• •\u4E00\u2028<100>\u4E00•</data>
419<data>• •\u2024\u2028<100>\u2024•</data>
420<data>• \u002C\u2028<100>\u002C•</data>
421<data>• \u000A<100>\u2028<100>\u000A<100></data>
422<data>• •\u0E5A\u2028<100>\u0E5A•</data>
423<data>• •\u0032\u2028<100>\u0032•</data>
424<data>• •\u0028\u2028<100>\u0028•</data>
425<data>• •\u0025\u2028<100>\u0025•</data>
426<data>• •\u0024\u2028<100>\u0024•</data>
427<data>• •\u0022\u2028<100>\u0022•</data>
428<data>• •\u0E01\u2028<100>\u0E01•</data>
429<data>• •\uDB7F\u2028<100>\uDB7F•</data>
430<data>• \u0020\u2028<100>\u0020•</data>
431<data>• \u002F\u2028<100>\u002F•</data>
432<data>• •\uF8FF\u2028<100>\uF8FF•</data>
433<data>• \u200B\u2028<100>\u200B•</data>
b75a7d8f
A
434
435
436#
437# Old Line Break Test data. Originally located in RBBITest::TestDefaultRuleBasedLineIteration()
438#
439
440<line>
441
442<data>•Multi-•Level •example •of •a •semi-•idiotic •non-•sensical •(non-•important) •sentence.
374ca955 443<100>Hi •Hello •How\n<100>are\r<100>you\u2028<100>fine.\t•good. •Now\r<100>is\n<100>the\r\n<100>time\n<100>\r<100>for\r<100>\r<100>all•</data>
b75a7d8f
A
444
445<line>
374ca955
A
446<data>•Hello! •how\r\n<100> •(are)\r<100> •you? •I'am •fine- •Thankyou. •foo\u00a0bar
447<100>How, •are, •you? •This, •costs •$20,00,000.•</data>
b75a7d8f
A
448
449# test for bug #4068133
450#
451<data>•\u96f6•\u4e00\u3002•\u4e8c\u3001•\u4e09\u3002\u3001•\u56db\u3001\u3002\u3001•\u4e94,•\u516d.•\u4e03.\u3001,\u3002•\u516b•</data>
452
453# to test for bug #4086052
454<data>•foo\u00a0bar•</data>
455
456# to test for bug #4097920
374ca955 457<data>•dog,cat,mouse •(one)•(two)\n<100></data>
b75a7d8f
A
458
459# to test for bug #4035266
374ca955 460<data>•The •balance •is •$-23,456.78, •not •-•$32,456.78!\n<100></data>
b75a7d8f
A
461
462
463# to test for bug #4098467
464# What follows is a string of Korean characters (I found it in the Yellow Pages
465# ad for the Korean Presbyterian Church of San Francisco, and I hope I transcribed
466# it correctly), first as precomposed syllables, and then as conjoining jamo.
467# Both sequences should be semantically identical and break the same way.
468# precomposed syllables... (I == Rich Gillam?)
469#
470<data>•\uc0c1•\ud56d •\ud55c•\uc778 •\uc5f0•\ud569 •\uc7a5•\ub85c•\uad50•\ud68c•</data>
471
472# conjoining jamo...
473# TODO: rules update needed
474#<data>•\u1109\u1161\u11bc•\u1112\u1161\u11bc •\u1112\u1161\u11ab•\u110b\u1175\u11ab #•\u110b\u1167\u11ab•\u1112\u1161\u11b8 •\u110c\u1161\u11bc•\u1105\u1169•\u1100\u116d•\u1112\u116c•</data>
475
476# to test for bug #4117554: Fullwidth .!? should be treated as postJwrd
477<data>•\u4e01\uff0e•\u4e02\uff01•\u4e03\uff1f•</data>
478
479# Surrogate line break tests.
480#
374ca955 481<data>•\u4e01•\ud840\udc01•\u4e02•abc •\ue000 •\udb80\udc01•</data>
b75a7d8f
A
482
483# Regression for bug 836
73c04bcf 484<data>•AAA(AAA •</data>
b75a7d8f
A
485
486# Try some words from other scripts.
487# Greek, Cyrillic, Hebrew, Arabic, Arabic-Indic digits, Georgian, Latin
488#
489<data>•ΑΒΓ •БВГ •אבג֓ •ابت •١٢٣ •\u10A0\u10A1\u10A2 •ABC •</data>
490
491
492########################################################################################
493#
494#
495# T i t l e B o u n d a r y T e s t s
496#
497#
498##########################################################################################
499<title>
500<data>•Here •is •a •short •sample •sentence. •And •another.•</data>
501<data>•HERE •IS •A •SHORT •SAMPLE •SENTENCE. •AND •ANOTHER.•</data>
502<data>• •Start •and •end •with •spaces •</data>
503<data>•Include 123 456 ^& •some 54332 •numbers 4445•abc123•abc •ending 1223 •</data>
504
505<data>•Combining\u0301 \u0301•ma\u0306rks •bye •</data>
506<data>•123 •Start •with •a •number.•</data>
507
508<data>•'•start •with •a •case-•ignorable •cha'r'a'cter•</data>
509
73c04bcf
A
510
511##########################################################################################
512#
513# Thai Tests
514#
515##########################################################################################
516<locale th>
517<word>
518#
519# Test data originally from the test code source file
520# // @suwit -- Thai sample data from GVT Guideline
521#
522<data>•\u0E2B\u0E19\u0E36\u0E48\u0E07<200>\u0E04\u0E33<200>\u0E44\u0E17\u0E22<200>\
523\u0E2A\u0E32\u0E21\u0E32\u0E23\u0E16<200>\u0E1B\u0E23\u0E30\u0E01\u0E2D\u0E1A<200>\
524\u0E14\u0E49\u0E27\u0E22<200>\u0e2b\u0e25\u0e32\u0e22<200>\
525\u0e1e\u0e22\u0e32\u0e07\u0e04\u0e4c<200></data>
526
527#
528# Jitterbug 3671 Test Case
529#
530<data>•สวัสดี<200>ครับ<200>สบาย<200>ดี<200>ไหม<200> •ครับ<200></data>
531
532#
533# Trac ticket 5595 Test Case
534<data>•บท<200>ที่๑พายุ<200>ไซโคลน<200>โด<200>โรธี<200>อาศัย<200>อยู่<200>ท่ามกลาง<200>\
535ทุ่งใหญ่<200>ใน<200>แคนซัส<200>กับ<200>ลุง<200>เฮ<200>นรี<200>ชาวไร่<200>และ<200>ป้า<200>เอ็ม<200>\
536ภรรยา<200>ชาวไร่<200>บ้าน<200>ของ<200>พวก<200>เขา<200>หลัง<200>เล็ก<200>เพราะ<200>ไม้<200>\
537สร้าง<200>บ้าน<200>ต้อง<200>ขน<200>มา<200>ด้วย<200>เกวียน<200>เป็น<200>ระยะ<200>ทาง<200>หลาย<200>\
538ไมล์<200></data>
539
540