X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/73c04bcfe1096173b00431f0cdc742894b15eef0..4d9eefca008a7bc544144ef830fa144ce89deaa0:/icuSources/test/testdata/rbbitst.txt?ds=sidebyside

diff --git a/icuSources/test/testdata/rbbitst.txt b/icuSources/test/testdata/rbbitst.txt
index d336c674..49057199 100644
--- a/icuSources/test/testdata/rbbitst.txt
+++ b/icuSources/test/testdata/rbbitst.txt
@@ -1,4 +1,4 @@
-ï»¿# Copyright (c) 2001-2006 International Business Machines
+ï»¿# Copyright (c) 2001-2013 International Business Machines
 # Corporation and others. All Rights Reserved.
 #
 # RBBI Test Data
@@ -20,13 +20,22 @@
 #      \         Escape.  Normal ICU unescape applied.  
 #      \ at end of line  ->  Line Continuation.  Remove both the backslash and the new line
 #   
+# In ICU4C, this test data is run by intltest, rbbi/RBBITest/TestExtended.
+# In ICU4J, this test data is run by com.ibm.icu.dev.test.rbbi.RBBITestExtended
 #
+# There are two copies of this file in the source repository,
+#   [ICU4C]   source/test/testdata/rbbitst.txt
+#   [ICU4J]   main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt
+#
+# ICU4C's copy is the master.  If any changes are made to ICU4J's copy, make sure they
+#  are merged back into ICU4C's copy of the file, lest they get overwritten later.
+# TODO:  figure out how to have a single copy of the file for use by both C and Java.
 
 
 #   Temp debugging tests 
-<line>
-#     to test for bug #4097920
-<data>â¢dog,cat,mouse â¢(one)â¢(two)\n<100></data>
+<char>
+<data>â¢\U00010020â¢\U00010000\u0301â¢xâ¢</data>
+<data>â¢\U00010020â¢\U00010000\N{COMBINING MACRON}â¢</data>
 
 ########################################################################################
 #
@@ -71,14 +80,14 @@
 
 
 # Hindi combining chars.  (An old test)
-<data>â¢à¤­â¢â¢à¤¾â¢\u0930â¢\u0924â¢ â¢\u0938\u0941\u0902â¢\u0926â¢\u0930â¢
-â¢\u0939â¢\u094câ¢\u0964â¢</data>
-<data>â¢\u0916\u0947â¢\u0938\u0941\u0902â¢\u0926â¢\u0930â¢\u0939â¢\u094câ¢\u0964â¢</data>
+#   TODO:  Update these tests for Unicode 5.1 Extended Grapheme clusters 
+#<data>â¢à¤­â¢â¢à¤¾â¢\u0930â¢\u0924â¢ â¢\u0938\u0941\u0902â¢\u0926â¢\u0930â¢
+#â¢\u0939â¢\u094câ¢\u0964â¢</data>
+#<data>â¢\u0916\u0947â¢\u0938\u0941\u0902â¢\u0926â¢\u0930â¢\u0939â¢\u094câ¢\u0964â¢</data>
 
 
-#  Bug 1587.  Tamil.  \u0baa\u0bc1 should be two separate characters, even though
-#                     Hyangmi would perfer that it be one.
-<data>â¢\u0baaâ¢\u0bc1â¢\u0baaâ¢\u0bc1â¢</data>
+#  Bug 1587.  Tamil.  \u0baa\u0bc1 is an Extended Grapheme Cluster
+<data>â¢\u0baa\u0bc1â¢\u0baa\u0bc1â¢</data>
 
 #   Regression test for bug 1889
 <data>â¢\u0f40\u0f7dâ¢\u0000â¢\u0f7eâ¢</data>
@@ -91,6 +100,28 @@
 #  Treat Japanese Half Width voicing marks as combining
 <data>â¢A\uff9eâ¢B\uff9f\uff9e\uff9fâ¢Câ¢</data>
 
+########################################################################################
+#
+#
+#       E x t e n d e d    G r a p h e m e    C l u s t e r     T e s t s
+#
+#
+##########################################################################################
+#<xgc>
+
+# Plain Vanilla grapheme clusters
+#<data>â¢aâ¢bâ¢câ¢</data>
+#<data>â¢a\u0301\u0302â¢ â¢b\u0303\u0304â¢</data>
+
+# Assorted Hindi combining marks
+#<data>â¢\u0904\u0903â¢ â¢\u0937\u093Eâ¢ â¢\u0904\u093Fâ¢ â¢\u0937\u0940â¢ â¢\u0937\u0949â¢ â¢\u0937\u094Aâ¢ â¢\u0937\u094Bâ¢ â¢\u0937\u094Câ¢</data>
+
+# Thai Clusters
+# $Prepend $Extend* $PrependBase $Extend*;
+#
+#<data>â¢\u0e40\u0e01â¢\u0e44\u0301\u0e23\u0302\u0303â¢\u0e40â¢\u0e40\u0e02â¢\u0e02â¢ â¢</data>
+
+
 ########################################################################################
 #
 #
@@ -140,7 +171,23 @@
 <data>â¢abc<200>\U0001D800â¢def<200>\U0001D3FFâ¢ â¢</data>
 
 # Hiragana & Katakana stay together, but separates from each other and Latin.
-<data>â¢abc<200>\N{HIRAGANA LETTER SMALL A}<300>\N{HIRAGANA LETTER VU}\N{COMBINING ACUTE ACCENT}<300>\N{HIRAGANA ITERATION MARK}<300>\N{KATAKANA LETTER SMALL A}\N{KATAKANA ITERATION MARK}\N{HALFWIDTH KATAKANA LETTER WO}\N{HALFWIDTH KATAKANA LETTER N}<300>def<200>#â¢</data>
+# *** what to do about theoretical combos of chars? i.e. hiragana + accent
+#<data>â¢abc<200>\N{HIRAGANA LETTER SMALL A}<400>\N{HIRAGANA LETTER VU}\N{COMBINING ACUTE ACCENT}<400>\N{HIRAGANA ITERATION MARK}<400>\N{KATAKANA LETTER SMALL A}\N{KATAKANA ITERATION MARK}\N{HALFWIDTH KATAKANA LETTER WO}\N{HALFWIDTH KATAKANA LETTER N}<400>def<200>#â¢</data>
+
+# test normalization/dictionary handling of halfwidth katakana: same dictionary phrase in fullwidth and halfwidth
+<data>â¢è½ã­ã£ãã<400>è½ã­ã£ï¾ï¾ã<400></data>
+
+# more Japanese tests
+# TODO: Currently, U+30FC and other characters (script=common) in the Hiragana
+# and the Katakana block are not treated correctly. Enable this later.
+#<data>â¢ã©ã¼<400>ã<400>æ¥æ¬èª<400>ã<400>åå¼·<400>ãã<400>çç±<400>ã«ã¤ãã¦<400>ãâ¢ã¦<400>ãã¨<400>ã¯<400>æ<400>ã§ã<400>ç¥<400>ã<400>ã<400>ã<400>ãã¨<400>ãªã<400>ã <400>ãâ¢</data>
+<data>â¢æ¥æ¬èª<400>ã<400>åå¼·<400>ãã<400>çç±<400>ã«ã¤ãã¦<400>ãâ¢ã¦<400>ãã¨<400>ã¯<400>æ<400>ã§ã<400>ç¥<400>ã<400>ã<400>ã<400>ãã¨<400>ãªã<400>ã <400>ãâ¢</data>
+
+# Testing of word boundary for dictionary word containing both kanji and kana
+<data>â¢ä¸­ã ãã¿<400>èµçã®æ£®<400>ã¦é¢å³¶<400></data>
+
+# Testing of Chinese segmentation (taken from a Chinese news article)
+<data>â¢400<100>ä½<400>å<400>ä¸­å¤®<400>å§å<400>å<400>ä¸­å¤®<400>åè¡¥<400>å§å<400>é½<400>é¢<400>å°äº<400>ââ¢æ¨è<400>ç¥¨<400>ââ¢ï¼â¢æ<400>èµæ ¼<400>å¨<400>200<100>å¤<400>å<400>ç¬¦å<400>æ¡ä»¶<400>ç<400>63<100>å²<400>ä»¥ä¸<400>ä¸­å±<400>æ­£<400>é¨<400>çº§<400>å¹²é¨<400>ä¸­<400>ï¼â¢éåº<400>ä»ä»¬<400>å±æ<400>ç<400>ä¸­å¤®<400>æ¿æ²»å±<400>å§å<400>ä»¥<400>å<400>æ¿æ²»å±<400>å¸¸å§<400>ä¼<400>ä¸¾è<400>ãâ¢</data>
 
 # Words with interior formatting characters
 <data>â¢def\N{COMBINING ACUTE ACCENT}\N{SYRIAC ABBREVIATION MARK}ghi<200> â¢</data>
@@ -148,6 +195,9 @@
 # to test for bug #4097779
 <data>â¢aa\N{COMBINING GRAVE ACCENT}a<200> â¢</data>
 
+# fullwidth numeric, midletter characters etc should be treated like their halfwidth counterparts
+# <data>â¢ï¼©ï¼³ï¼®'ï¼´<200> â¢ï¼ï¼<100>æ¥<400></data>
+# why was this added with the dbbi stuff?
 
 #      to test for bug #4098467
 #      What follows is a string of Korean characters (I found it in the Yellow Pages
@@ -157,9 +207,15 @@
 #      precomposed syllables...
 <data>â¢\uc0c1\ud56d<200> â¢\ud55c\uc778<200> â¢\uc5f0\ud569<200> â¢\uc7a5\ub85c\uad50\ud68c<200> â¢\u1109\u1161\u11bc\u1112\u1161\u11bc<200> â¢\u1112\u1161\u11ab\u110b\u1175\u11ab<200> â¢\u110b\u1167\u11ab\u1112\u1161\u11b8<200> â¢\u110c\u1161\u11bc\u1105\u1169\u1100\u116d\u1112\u116c<200> â¢</data>
 
-<data>â¢abc<200>\u4e01<400>\u4e02<400>\u3005<200>\u4e03<400>\u4e03<400>abc<200> â¢</data>
+# more Korean tests (Jamo not tested here, not counted as dictionary characters)
+# Disabled for now because we don't include a Korean dictionary.
+#<data>â¢\ud55c\uad6d<200>\ub300\ud559\uad50<200>\uc790\uc5f0<200>\uacfc\ud559<200>\ub300\ud559<200>\ubb3c\ub9ac\ud559\uacfc<200></data>
+#<data>â¢\ud604\uc7ac<200>\ub294<200> â¢\uac80\ucc30<200>\uc774<200> â¢\ubd84\uc2dd<200>\ud68c\uacc4<200>\ubb38\uc81c<200>\ub97c<200> â¢\uc870\uc0ac<200>\ud560<200> â¢\uac00\ub2a5\uc131<200>\uc740<200> â¢\uc5c6\ub2e4<200>\u002eâ¢</data>
+
+<data>â¢abc<200>\u4e01<400>\u4e02<400>\u3005<400>\u4e03\u4e03<400>abc<200> â¢</data>
+
+<data>â¢\u06c9<200>\uc799\ufffaâ¢</data>
 
-<data>â¢\u06c9\uc799\ufffa<200></data>
 
 #      
 #      Try some words from other scripts.
@@ -193,6 +249,9 @@
 #
 <data>â¢A\uff9e\uff9fBC<200> â¢1\uff9e\uff9f23<100></data>
 
+# User guide example:
+<data>â¢Parlez<200>-â¢vous<200> â¢franÃ§ais<200> â¢?â¢</data>
+
 ########################################################################################
 #
 #
@@ -432,6 +491,9 @@ What is the proper use of the abbreviation pp.? â¢Yes, I am definatelly 12" tal
 <data>â¢  â¢\uF8FF\u2028<100>\uF8FFâ¢</data>
 <data>â¢   \u200B\u2028<100>\u200Bâ¢</data>
 
+# User Guide example
+
+<data>â¢Parlez-â¢vous â¢franÃ§ais ?â¢</data>
 
 #
 #  Old Line Break Test data.  Orginally located in RBBITest::TestDefaultRuleBasedLineIteration()
@@ -470,17 +532,19 @@ What is the proper use of the abbreviation pp.? â¢Yes, I am definatelly 12" tal
 <data>â¢\uc0c1â¢\ud56d â¢\ud55câ¢\uc778 â¢\uc5f0â¢\ud569 â¢\uc7a5â¢\ub85câ¢\uad50â¢\ud68câ¢</data>
 
 #      conjoining jamo...
-#      TODO:  rules update needed
-#<data>â¢\u1109\u1161\u11bcâ¢\u1112\u1161\u11bc â¢\u1112\u1161\u11abâ¢\u110b\u1175\u11ab #â¢\u110b\u1167\u11abâ¢\u1112\u1161\u11b8 â¢\u110c\u1161\u11bcâ¢\u1105\u1169â¢\u1100\u116dâ¢\u1112\u116câ¢</data>
+<data>â¢\u1109\u1161\u11bcâ¢\u1112\u1161\u11bc â¢\u1112\u1161\u11abâ¢\u110b\u1175\u11ab â¢\u110b\u1167\u11abâ¢\u1112\u1161\u11b8 â¢\u110c\u1161\u11bcâ¢\u1105\u1169â¢\u1100\u116dâ¢\u1112\u116câ¢</data>
 
 #      to test for bug #4117554: Fullwidth .!? should be treated as postJwrd
 <data>â¢\u4e01\uff0eâ¢\u4e02\uff01â¢\u4e03\uff1fâ¢</data>
 
 #      Surrogate line break tests.
 #
-<data>â¢\u4e01â¢\ud840\udc01â¢\u4e02â¢abc â¢\ue000 â¢\udb80\udc01â¢</data>
+<data>â¢\u4e01â¢\ud840\udc01â¢\u4e02â¢abc â¢\ue000 â¢\udb80\udc01â¢</data>  #This line and the following are equivalent.
+<data>â¢\u4e01â¢\U00020001â¢\u4e02â¢abc â¢\ue000 â¢\U000f0001â¢</data>
 
 #      Regression for bug 836
+#        Note:  Unicode 5.1 changed this behavior
+#               Unicode 5.2 changed it again, there is no break following the '('
 <data>â¢AAA(AAA â¢</data> 
 
 #       Try some words from other scripts.
@@ -488,6 +552,20 @@ What is the proper use of the abbreviation pp.? â¢Yes, I am definatelly 12" tal
 #      
 <data>â¢ÎÎÎ â¢ÐÐÐ â¢×××Ö â¢Ø§Ø¨Øª â¢Ù¡Ù¢Ù£ â¢\u10A0\u10A1\u10A2 â¢ABC â¢</data>
 
+#
+#       ticket #4853:  unpaired surrogates should behave like AL
+#
+<data>â¢abc\ud801xyzâ¢</data>
+
+#
+#     Regression tests for failures that originally came from the monkey test.
+#     Monkey test failure lines can, with slight reformatting, be copied into this section
+#     as test cases.  The error display from here is more informative.
+#
+<data>â¢\ufffcâ¢\u30e3\u000c<100>\u1b39\u300a\u002f\u203a\u200bâ¢\ufffcâ¢\uaf64â¢\udcfbâ¢</data>
+<data>â¢\u114d\u31f3â¢\ube44\u002dâ¢\u0362\u24e2\u276e\u2014\u205f\ufe16â¢\uc877â¢\u0fd0\u000a<100>\u20a3â¢</data>
+<data>â¢\u080a\u215b\U0001d7d3\u002câ¢\u2025\U000e012eâ¢\u02df\u118d\u0029\ua8d6\u0085<100>\u6cc4\u2024\u202f\ufffcâ¢</data>
+ 
 
 ########################################################################################
 #
@@ -524,6 +602,120 @@ What is the proper use of the abbreviation pp.? â¢Yes, I am definatelly 12" tal
 \u0E14\u0E49\u0E27\u0E22<200>\u0e2b\u0e25\u0e32\u0e22<200>\
 \u0e1e\u0e22\u0e32\u0e07\u0e04\u0e4c<200></data>
 
+# Test data originally from http://bugs.icu-project.org/trac/search?q=r30327
+<data>â¢à¸à¸¹<200> â¢à¸à¸´à¸<200>à¸à¸¸à¹à¸<200> â¢à¸à¸´à¹à¹<200>à¸à¸­<200>à¸¢à¸¹à¹<200>à¹à¸<200>à¸à¹à¸³<200></data>
+
+<data>â¢\u0E01\u0E39<200>\u0020â¢\u0E01\u0E34\u0E19<200>\u0E01\u0E38\u0E49\u0E07<200>\
+\u0020â¢\u0E1B\u0E34\u0E49\u0E48<200>\u0E07\u0E2D<200>\u0E22\u0E39\u0E48<200>\
+\u0E43\u0E19<200>\u0E16\u0E49\u0E33<200></data>
+
+<line>
+<data>â¢\u0E01\u0E39\u0020â¢\u0E01\u0E34\u0E19â¢\u0E01\u0E38\u0E49\u0E07\
+\u0020â¢\u0E1B\u0E34\u0E49\u0E48â¢\u0E07\u0E2Dâ¢\u0E22\u0E39\u0E48â¢\
+\u0E43\u0E19â¢\u0E16\u0E49\u0E33â¢</data>
+
+# Data originally from intltest RBBITest::TestThaiLineBreak()
+#
+#  \u0e2f-- the Thai paiyannoi character-- isn't a letter.  It's a symbol that
+#  represents elided letters at the end of a long word.  It should be bound to
+#  the end of the word and not treated as an independent punctuation mark.
+#
+#  the one time where the paiyannoi occurs somewhere other than at the end
+#  of a word is in the Thai abbreviation for "etc.", which both begins and
+#  ends with a paiyannoi
+#
+<line>
+<data>â¢\u0e2a\u0e16\u0e32\u0e19\u0e35\u0e2fâ¢\
+\u0e08\u0e30â¢\
+\u0e23\u0e30\u0e14\u0e21â¢\
+\u0e40\u0e08\u0e49\u0e32â¢\
+\u0e2b\u0e19\u0e49\u0e32\u0e17\u0e35\u0e48â¢\
+\u0e2d\u0e2d\u0e01â¢\
+\u0e21\u0e32â¢\
+\u0e40\u0e23\u0e48\u0e07â¢\
+\u0e23\u0e30\u0e1a\u0e32\u0e22â¢\
+\u0e2d\u0e22\u0e48\u0e32\u0e07â¢\
+\u0e40\u0e15\u0e47\u0e21â¢\
+\u0e2f\u0e25\u0e2fâ¢\
+\u0e17\u0e35\u0e48â¢\
+\u0e19\u0e31\u0e49\u0e19â¢</data>
+
+# Data originally from RBBITest::TestMixedThaiLineBreak()
+#   @suwit -- Test Arabic numerals, Thai numerals, Punctuation and English characters start
+#
+<line>
+<data>â¢\u0E1B\u0E35â¢\
+\u0E1E\u0E38\u0E17\u0E18\u0E28\u0E31\u0E01\u0E23\u0E32\u0E0A â¢\
+2545 â¢\
+\u0E40\u0E1B\u0E47\u0E19â¢\
+\u0E1B\u0E35â¢\
+\u0E09\u0E25\u0E2D\u0E07â¢\
+\u0E04\u0E23\u0E1Aâ¢\
+\u0E23\u0E2D\u0E1A â¢\
+\"\u0E52\u0E52\u0E50 â¢\
+\u0E1b\u0E35\" â¢\
+\u0E02\u0E2d\u0E07â¢\
+\u0E01\u0E23\u0E38\u0E07â¢\
+\u0E23\u0E31\u0E15\u0E19\u0E42\u0E01\u0E2A\u0E34\u0E19\u0E17\u0E23\u0E4C â¢\
+(\u0E01\u0E23\u0E38\u0E07\u0E40\u0E17\u0E1e\u0E2Fâ¢\
+\u0E2B\u0E23\u0E37\u0E2D â¢\
+Bangkok)â¢</data>
+
+# Data originally from RBBITest::TestMaiyamok()
+#   The Thai maiyamok character is a shorthand symbol that means "repeat the previous
+#   word".  Instead of appearing as a word unto itself, however, it's kept together
+#   with the word before it.
+#
+<line>
+<data>â¢\u0e44\u0e1b\u0e46â¢\
+\u0e21\u0e32\u0e46â¢\
+\u0e23\u0e30\u0e2b\u0e27\u0e48\u0e32\u0e07â¢\
+\u0e01\u0e23\u0e38\u0e07â¢\
+\u0e40\u0e17\u0e1eâ¢\
+\u0e41\u0e25\u0e30â¢\
+\u0e40\u0e03\u0e35â¢\
+\u0e22\u0e07â¢\
+\u0e43\u0e2b\u0e21\u0e48â¢</data>
+
+
+
+##########################################################################################
+#
+#   Khmer Tests
+#
+##########################################################################################
+
+# Test data originally from http://bugs.icu-project.org/trac/search?q=r30327
+#  from the file testdata/wordsegments.txt
+<locale en>
+<word>
+
+<data>â¢áá¾<200>ááá<200>áá<200>áá¸<200>áááááá<200>áá¶<200></data>
+<data>â¢ááááá¼á<200>á<200>áááááá<200>áááá½á<200></data>
+<data>â¢áááá<200>ááááá<200>áá¶<200>áááá<200></data>
+#áááááá|áá¸|áááá|áá½ááá½á|áá·á|áá¶á|ááááááááá¶|ááá|á¢á¶á|áá¼á<200></data>
+<data>â¢áááááá<200>áá¸<200>áááá<200>ááá<200>áá·á<200>áá¶á<200>ááááááááá¶<200>ááá<200>á¢á¶á<200>áá¼á<200></data>
+#áááááá|áá¸|áááá|áá¶|áá½á|áá·á|áá¶á|ááááááááá¶|ááá|á¢á¶á|áá¼á<200></data>
+<data>â¢áá¼á<200>áááá¶áááá<200>ááááá·á<200>áá¾áááá¸<200>á¢áá·áááá¶á<200>á¢ááááááá»á<200>ááá<200>ááááá¢ááá<200></data>
+<data>â¢áá¶á<200>ááááá¶á<200>ááá·áááá¼áá<200>ááá<200></data>
+<data>â¢áááá¾áááá¶áá<200>ááá¢á¶á<200>áá¶áá<200>áá·ááá<200>áá·ááá<200>ááá<200></data>
+<data>â¢áá¾<200>á¢ááá<200>áááááááºááá<200>á¢ááá¾á¢á¶ááááá<200>áá»á<200>áá¶<200>áá¶á<200></data>
+<data>â¢ááááá¶áá<200>ááááá¶<200>á<200>ááááá¼ááá<200>áááááá<200>ááá<200></data>
+<data>â¢áá¾ááá½<200>ááá»áááá<200>á¥á<200>áááááá<200>ááááá<200>ááá<200>á¡á¾á<200></data>
+<data>â¢áá·á<200>á¢á¶á<200>á²áá<200>áá¾á<200>áááá¾<200>ááá·áááá<200>áá¶á<200>á¡á¾á<200></data>
+<data>â¢áááá<200>áááááá<200>ááááá<200></data>
+<data>â¢áááá»á<200>ááá¼áá·á<200>ááááá¶áá<200>áááá¶<200></data>
+<data>â¢áááá¶á<200>áá»á<200>áááá¶ááá<200>áá<200>áá¶<200></data>
+<data>â¢áá¶á<200>áá»ááá¶á<200>ááá·áááá¼áá<200></data>
+<data>â¢áá¶<200>ááááá<200>áááá»á<200>áá¹á<200>áááá¶áá<200>áááá·á<200>áá¶<200>ááááá<200></data>
+<data>â¢á¯<200>áá¶á<200>áá¶á<200>áááá¶á<200>ááá<200>áá¶á<200></data>
+<data>â¢áá<200>áá¹á<200>áá<200>ááá¡á¶áá<200></data>
+#á¢ááá|ááá|áá¶|ááááááá¸|áá|áá¾|áá¶áá|áá<200></data>
+<data>â¢áá<200>áá<200>ááááá¶ááááá<200>áááá¶áá<200></data>
+<data>â¢áááá¼á<200>áá¶á<200>áá<200>ááááá¶áá<200></data>
+<data>â¢áááááá»á<200>áááá»á<200>ááá<200>áá áááá»á<200></data>
+
+
 #
 #  Jitterbug 3671 Test Case
 #
@@ -537,4 +729,99 @@ What is the proper use of the abbreviation pp.? â¢Yes, I am definatelly 12" tal
 à¸ªà¸£à¹à¸²à¸<200>à¸à¹à¸²à¸<200>à¸à¹à¸­à¸<200>à¸à¸<200>à¸¡à¸²<200>à¸à¹à¸§à¸¢<200>à¹à¸à¸§à¸µà¸¢à¸<200>à¹à¸à¹à¸<200>à¸£à¸°à¸¢à¸°<200>à¸à¸²à¸<200>à¸«à¸¥à¸²à¸¢<200>\
 à¹à¸¡à¸¥à¹<200></data>
 
+####################################################################################
+#
+#  Tailored (locale specific) breaking.
+#
+####################################################################################
+
+# Japanese line break tailoring test
+
+<locale ja>
+<line>
+<data>â¢\u3041â¢\u3043â¢\u3045â¢\u31f1â¢</data>
+<locale en>
+<line>
+<data>â¢\u3041\u3043\u3045\u31f1â¢</data>
+
+# The following data was originally in RBBITest::TestJapaneseWordBreak()
+<locale ja>
+<word>
+<data>â¢\u4ECA\u65E5<400>\u306F<400>\u3044\u3044<400>\u5929\u6C17<400>\u3067\u3059<400>\u306D<400>\u3002â¢\u000D\u000Aâ¢</data>
+
+# UBreakIteratorType UBRK_WORD, Locale "ja"
+# Don't break in runs of hiragana or runs of ideograph, where the latter includes \u3005 \u3007 \u303B (cldrbug #2009).
+# \u79C1\u9054\u306B\u4E00\u3007\u3007\u3007\u306E\u30B3\u30F3\u30D4\u30E5\u30FC\u30BF\u304C\u3042\u308B\u3002\u5948\u3005\u306F\u30EF\u30FC\u30C9\u3067\u3042\u308B\u3002
+# modified to work with dbbi code - should verify
+
+<locale ja>
+<word>
+<data>â¢ç§<400>é<400>ã«<400>ä¸<400>ã<400>ãã<400>ã®<400>ã³ã³ãã¥ã¼ã¿<400>ã<400>ãã<400>ã<0>å¥ã<400>ã¯<400>ã¯ã¼ã<400>ã§<400>ãã<400>ãâ¢</data>
+
+<locale root>
+<word>
+<data>â¢ç§<400>é<400>ã«<400>ä¸<400>ã<400>ãã<400>ã®<400>ã³ã³ãã¥ã¼ã¿<400>ã<400>ãã<400>ã<0>å¥ã<400>ã¯<400>ã¯ã¼ã<400>ã§<400>ãã<400>ãâ¢</data>
+
+# UBreakIteratorType UBRK_SENTENCE, Locale "el"
+# Add break after Greek question mark (cldrbug #2069).
+# "\u0391\u03B2, \u03B3\u03B4; \u0395 \u03B6\u03B7\u037E \u0398 \u03B9\u03BA. "
+# "\u039B\u03BC \u03BD\u03BE! \u039F\u03C0, \u03A1\u03C2? \u03A3"
+# which is "Αβ, γδ; Ε ζη; Θ ικ. Λμ νξ! Οπ, Ρς? Σ"
+
+<locale root>
+<sent>
+<data>â¢ÎÎ², Î³Î´; Î Î¶Î·Í¾ Î Î¹Îº. â¢ÎÎ¼ Î½Î¾! â¢ÎÏ, Î¡Ï? â¢Î£<100></data>
+
+<locale el>
+<sent>
+<data>â¢ÎÎ², Î³Î´; â¢Î Î¶Î·Í¾ â¢Î Î¹Îº. â¢ÎÎ¼ Î½Î¾! â¢ÎÏ, Î¡Ï? â¢Î£<100></data>
+
+#  UBreakIteratorType UBRK_WORD, Locale "en_US_POSIX"
+#  Words don't include colon or period (cldrbug #1969).
+
+<locale en_US>
+<word>
+<data>â¢Can't<200> â¢have<200> â¢breaks<200> â¢in<200> â¢xx:yy<200> â¢or<200> â¢struct.field<200> \
+â¢for<200> â¢CS<200>-â¢types<200>.â¢</data>
+
+<locale en_US_POSIX>
+<word>
+<data>â¢Can't<200> â¢have<200> â¢breaks<200> â¢in<200> â¢xx<200>:â¢yy<200> â¢or<200> â¢struct<200>.â¢field<200> \
+â¢for<200> â¢CS<200>-â¢types<200>.â¢</data>
+
+
+# UBreakIteratorType UBRK_CHARACTER, Locale "th"
+# Clusters should not include spacing Thai/Lao vowels (prefix or postfix), except for [SARA] AM (cldrbug #2161).
+# Update:  As of Unicode 6.1 root has same behavior as th for this.
+#
+# "\u0E01\u0E23\u0E30\u0E17\u0E48\u0E2D\u0E21\u0E23\u0E08\u0E19\u0E32 "
+# "(\u0E2A\u0E38\u0E0A\u0E32\u0E15\u0E34-\u0E08\u0E38\u0E11\u0E32\u0E21\u0E32\u0E28) "
+# "\u0E40\u0E14\u0E47\u0E01\u0E21\u0E35\u0E1B\u0E31\u0E0D\u0E2B\u0E32 "
+# which is "กระท่อมรจนา (สุชาติ-จุฑามาศ) เด็กมีปัญหา "
+
+<locale th>
+<char>
+<data>â¢\u0E01â¢\u0E23â¢\u0E30â¢\u0E17\u0E48â¢\u0E2Dâ¢\u0E21â¢\u0E23â¢\u0E08â¢\u0E19â¢\u0E32â¢ â¢\
+(â¢\u0E2A\u0E38â¢\u0E0Aâ¢\u0E32â¢\u0E15\u0E34â¢-â¢\u0E08\u0E38â¢\u0E11â¢\u0E32â¢\u0E21â¢\u0E32â¢\u0E28â¢)â¢ â¢\
+\u0E40â¢\u0E14\u0E47â¢\u0E01â¢\u0E21\u0E35â¢\u0E1B\u0E31â¢\u0E0Dâ¢\u0E2Bâ¢\u0E32â¢ â¢</data>
+
+# Finnish line breaking
+#
+# These rules deal with hyphens when there is a space on the leading side. 
+# There should be a break opportunity between the space and the hyphen, and not after the hyphen.
+# See CLDR ticket 3029.
+# See ICU ticket 8151 
+
+<locale root>
+<line>
+<data>â¢abc â¢- â¢def    â¢abc â¢-â¢def    â¢abc- â¢def   â¢abc-â¢defâ¢</data>   # With ASCII hyphen
+<data>â¢abc â¢â â¢def    â¢abc â¢ââ¢def    â¢abcâ â¢def   â¢abcââ¢defâ¢</data>   # With Unicode u2010 hyphen
+
+<locale fi>
+<line>
+# TODO: problems with Finnish line break rules cause these two lines to fail.
+#<data>â¢abc â¢- â¢def    â¢abc â¢-def    â¢abc- â¢def   â¢abc-â¢defâ¢</data>   # With ASCII hyphen
+#<data>â¢abc â¢â â¢def    â¢abc â¢âdef    â¢abcâ â¢def   â¢abcââ¢defâ¢</data>   # With Unicode u2010 hyphen
 
+<data>â¢abc â¢- â¢def    â¢abc â¢-def    â¢abc- â¢def   â¢</data>   # With ASCII hyphen
+<data>â¢abc â¢â â¢def    â¢abc â¢âdef    â¢abcâ â¢def   â¢</data>   # With Unicode u2010 hyphen