ICU-511.31.tar.gz

[apple/icu.git] / icuSources / test / testdata / rbbitst.txt
diff --git a/icuSources/test/testdata/rbbitst.txt b/icuSources/test/testdata/rbbitst.txt

index d336c674e814f58474c0d359f7a4ab1b08ce9587..4905719970b2728425899eec09c12fa0debea5b4 100644 (file)
--- a/icuSources/test/testdata/rbbitst.txt
+++ b/icuSources/test/testdata/rbbitst.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2001-2006 International Business Machines
+# Copyright (c) 2001-2013 International Business Machines
  # Corporation and others. All Rights Reserved.
  #
  # RBBI Test Data
@@ -20,13 +20,22 @@
  #      \         Escape.  Normal ICU unescape applied.  
  #      \ at end of line  ->  Line Continuation.  Remove both the backslash and the new line
  #   
+# In ICU4C, this test data is run by intltest, rbbi/RBBITest/TestExtended.
+# In ICU4J, this test data is run by com.ibm.icu.dev.test.rbbi.RBBITestExtended
  #
+# There are two copies of this file in the source repository,
+#   [ICU4C]   source/test/testdata/rbbitst.txt
+#   [ICU4J]   main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt
+#
+# ICU4C's copy is the master.  If any changes are made to ICU4J's copy, make sure they
+#  are merged back into ICU4C's copy of the file, lest they get overwritten later.
+# TODO:  figure out how to have a single copy of the file for use by both C and Java.
  
  
  #   Temp debugging tests 
-<line>
-#     to test for bug #4097920
-<data>•dog,cat,mouse •(one)•(two)\n<100></data>
+<char>
+<data>•\U00010020•\U00010000\u0301•x•</data>
+<data>•\U00010020•\U00010000\N{COMBINING MACRON}•</data>
  
  ########################################################################################
  #
@@ -71,14 +80,14 @@
  
  
  # Hindi combining chars.  (An old test)
-<data>•भ••ा•\u0930•\u0924• •\u0938\u0941\u0902•\u0926•\u0930•
-•\u0939•\u094c•\u0964•</data>
-<data>•\u0916\u0947•\u0938\u0941\u0902•\u0926•\u0930•\u0939•\u094c•\u0964•</data>
+#   TODO:  Update these tests for Unicode 5.1 Extended Grapheme clusters 
+#<data>•भ••ा•\u0930•\u0924• •\u0938\u0941\u0902•\u0926•\u0930•
+#•\u0939•\u094c•\u0964•</data>
+#<data>•\u0916\u0947•\u0938\u0941\u0902•\u0926•\u0930•\u0939•\u094c•\u0964•</data>
  
  
-#  Bug 1587.  Tamil.  \u0baa\u0bc1 should be two separate characters, even though
-#                     Hyangmi would perfer that it be one.
-<data>•\u0baa•\u0bc1•\u0baa•\u0bc1•</data>
+#  Bug 1587.  Tamil.  \u0baa\u0bc1 is an Extended Grpaheme Cluster
+<data>•\u0baa\u0bc1•\u0baa\u0bc1•</data>
  
  #   Regression test for bug 1889
  <data>•\u0f40\u0f7d•\u0000•\u0f7e•</data>
@@ -91,6 +100,28 @@
  #  Treat Japanese Half Width voicing marks as combining
  <data>•A\uff9e•B\uff9f\uff9e\uff9f•C•</data>
  
+########################################################################################
+#
+#
+#       E x t e n d e d    G r a p h e m e    C l u s t e r     T e s t s
+#
+#
+##########################################################################################
+#<xgc>
+
+# Plain Vanilla grapheme clusters
+#<data>•a•b•c•</data>
+#<data>•a\u0301\u0302• •b\u0303\u0304•</data>
+
+# Assorted Hindi combining marks
+#<data>•\u0904\u0903• •\u0937\u093E• •\u0904\u093F• •\u0937\u0940• •\u0937\u0949• •\u0937\u094A• •\u0937\u094B• •\u0937\u094C•</data>
+
+# Thai Clusters
+# $Prepend $Extend* $PrependBase $Extend*;
+#
+#<data>•\u0e40\u0e01•\u0e44\u0301\u0e23\u0302\u0303•\u0e40•\u0e40\u0e02•\u0e02• •</data>
+
+
  ########################################################################################
  #
  #
@@ -140,7 +171,23 @@
  <data>•abc<200>\U0001D800•def<200>\U0001D3FF• •</data>
  
  # Hiragana & Katakana stay together, but separates from each other and Latin.
-<data>•abc<200>\N{HIRAGANA LETTER SMALL A}<300>\N{HIRAGANA LETTER VU}\N{COMBINING ACUTE ACCENT}<300>\N{HIRAGANA ITERATION MARK}<300>\N{KATAKANA LETTER SMALL A}\N{KATAKANA ITERATION MARK}\N{HALFWIDTH KATAKANA LETTER WO}\N{HALFWIDTH KATAKANA LETTER N}<300>def<200>#•</data>
+# *** what to do about theoretical combos of chars? i.e. hiragana + accent
+#<data>•abc<200>\N{HIRAGANA LETTER SMALL A}<400>\N{HIRAGANA LETTER VU}\N{COMBINING ACUTE ACCENT}<400>\N{HIRAGANA ITERATION MARK}<400>\N{KATAKANA LETTER SMALL A}\N{KATAKANA ITERATION MARK}\N{HALFWIDTH KATAKANA LETTER WO}\N{HALFWIDTH KATAKANA LETTER N}<400>def<200>#•</data>
+
+# test normalization/dictionary handling of halfwidth katakana: same dictionary phrase in fullwidth and halfwidth
+<data>•芽キャベツ<400>芽キャﾍﾞツ<400></data>
+
+# more Japanese tests
+# TODO: Currently, U+30FC and other characters (script=common) in the Hiragana
+# and the Katakana block are not treated correctly. Enable this later.
+#<data>•どー<400>せ<400>日本語<400>を<400>勉強<400>する<400>理由<400>について<400>　•て<400>こと<400>は<400>我<400>でも<400>知<400>ら<400>も<400>い<400>こと<400>なん<400>だ<400>。•</data>
+<data>•日本語<400>を<400>勉強<400>する<400>理由<400>について<400>　•て<400>こと<400>は<400>我<400>でも<400>知<400>ら<400>も<400>い<400>こと<400>なん<400>だ<400>。•</data>
+
+# Testing of word boundary for dictionary word containing both kanji and kana
+<data>•中だるみ<400>蔵王の森<400>ウ離島<400></data>
+
+# Testing of Chinese segmentation (taken from a Chinese news article)
+<data>•400<100>余<400>名<400>中央<400>委员<400>和<400>中央<400>候补<400>委员<400>都<400>领<400>到了<400>“•推荐<400>票<400>”•，•有<400>资格<400>在<400>200<100>多<400>名<400>符合<400>条件<400>的<400>63<100>岁<400>以下<400>中共<400>正<400>部<400>级<400>干部<400>中<400>，•选出<400>他们<400>属意<400>的<400>中央<400>政治局<400>委员<400>以<400>向<400>政治局<400>常委<400>会<400>举荐<400>。•</data>
  
  # Words with interior formatting characters
  <data>•def\N{COMBINING ACUTE ACCENT}\N{SYRIAC ABBREVIATION MARK}ghi<200> •</data>
@@ -148,6 +195,9 @@
  # to test for bug #4097779
  <data>•aa\N{COMBINING GRAVE ACCENT}a<200> •</data>
  
+# fullwidth numeric, midletter characters etc should be treated like their halfwidth counterparts
+# <data>•ＩＳＮ'Ｔ<200> •１９<100>日<400></data>
+# why was this added with the dbbi stuff?
  
  #      to test for bug #4098467
  #      What follows is a string of Korean characters (I found it in the Yellow Pages
@@ -157,9 +207,15 @@
  #      precomposed syllables...
  <data>•\uc0c1\ud56d<200> •\ud55c\uc778<200> •\uc5f0\ud569<200> •\uc7a5\ub85c\uad50\ud68c<200> •\u1109\u1161\u11bc\u1112\u1161\u11bc<200> •\u1112\u1161\u11ab\u110b\u1175\u11ab<200> •\u110b\u1167\u11ab\u1112\u1161\u11b8<200> •\u110c\u1161\u11bc\u1105\u1169\u1100\u116d\u1112\u116c<200> •</data>
  
-<data>•abc<200>\u4e01<400>\u4e02<400>\u3005<200>\u4e03<400>\u4e03<400>abc<200> •</data>
+# more Korean tests (Jamo not tested here, not counted as dictionary characters)
+# Disable them now because we don't include a Korean dictionary.
+#<data>•\ud55c\uad6d<200>\ub300\ud559\uad50<200>\uc790\uc5f0<200>\uacfc\ud559<200>\ub300\ud559<200>\ubb3c\ub9ac\ud559\uacfc<200></data>
+#<data>•\ud604\uc7ac<200>\ub294<200> •\uac80\ucc30<200>\uc774<200> •\ubd84\uc2dd<200>\ud68c\uacc4<200>\ubb38\uc81c<200>\ub97c<200> •\uc870\uc0ac<200>\ud560<200> •\uac00\ub2a5\uc131<200>\uc740<200> •\uc5c6\ub2e4<200>\u002e•</data>
+
+<data>•abc<200>\u4e01<400>\u4e02<400>\u3005<400>\u4e03\u4e03<400>abc<200> •</data>
+
+<data>•\u06c9<200>\uc799\ufffa•</data>
  
-<data>•\u06c9\uc799\ufffa<200></data>
  
  #      
  #      Try some words from other scripts.
@@ -193,6 +249,9 @@
  #
  <data>•A\uff9e\uff9fBC<200> •1\uff9e\uff9f23<100></data>
  
+# User guide example:
+<data>•Parlez<200>-•vous<200> •français<200> •?•</data>
+
  ########################################################################################
  #
  #
@@ -432,6 +491,9 @@ What is the proper use of the abbreviation pp.? •Yes, I am definatelly 12" tal
  <data>•  •\uF8FF\u2028<100>\uF8FF•</data>
  <data>•   \u200B\u2028<100>\u200B•</data>
  
+# User Guide example
+
+<data>•Parlez-•vous •français ?•</data>
  
  #
  #  Old Line Break Test data.  Orginally located in RBBITest::TestDefaultRuleBasedLineIteration()
@@ -470,17 +532,19 @@ What is the proper use of the abbreviation pp.? •Yes, I am definatelly 12" tal
  <data>•\uc0c1•\ud56d •\ud55c•\uc778 •\uc5f0•\ud569 •\uc7a5•\ub85c•\uad50•\ud68c•</data>
  
  #      conjoining jamo...
-#      TODO:  rules update needed
-#<data>•\u1109\u1161\u11bc•\u1112\u1161\u11bc •\u1112\u1161\u11ab•\u110b\u1175\u11ab #•\u110b\u1167\u11ab•\u1112\u1161\u11b8 •\u110c\u1161\u11bc•\u1105\u1169•\u1100\u116d•\u1112\u116c•</data>
+<data>•\u1109\u1161\u11bc•\u1112\u1161\u11bc •\u1112\u1161\u11ab•\u110b\u1175\u11ab •\u110b\u1167\u11ab•\u1112\u1161\u11b8 •\u110c\u1161\u11bc•\u1105\u1169•\u1100\u116d•\u1112\u116c•</data>
  
  #      to test for bug #4117554: Fullwidth .!? should be treated as postJwrd
  <data>•\u4e01\uff0e•\u4e02\uff01•\u4e03\uff1f•</data>
  
  #      Surrogate line break tests.
  #
-<data>•\u4e01•\ud840\udc01•\u4e02•abc •\ue000 •\udb80\udc01•</data>
+<data>•\u4e01•\ud840\udc01•\u4e02•abc •\ue000 •\udb80\udc01•</data>  #This line and the following are equivalent.
+<data>•\u4e01•\U00020001•\u4e02•abc •\ue000 •\U000f0001•</data>
  
  #      Regression for bug 836
+#        Note:  Unicode 5.1 changed this behavior
+#               Unicode 5.2 changed it again, there is no break following the '('
  <data>•AAA(AAA •</data> 
  
  #       Try some words from other scripts.
@@ -488,6 +552,20 @@ What is the proper use of the abbreviation pp.? •Yes, I am definatelly 12" tal
  #      
  <data>•ΑΒΓ •БВГ •אבג֓ •ابت •١٢٣ •\u10A0\u10A1\u10A2 •ABC •</data>
  
+#
+#       ticket #4853:  unpaired surrogates should behave like AL
+#
+<data>•abc\ud801xyz•</data>
+
+#
+#     Regression tests for failures that originally came from the monkey test.
+#     Monkey test failure lines can, with slight reformatting, be copied into this section
+#     as test cases.  The error display from here is more informative.
+#
+<data>•\ufffc•\u30e3\u000c<100>\u1b39\u300a\u002f\u203a\u200b•\ufffc•\uaf64•\udcfb•</data>
+<data>•\u114d\u31f3•\ube44\u002d•\u0362\u24e2\u276e\u2014\u205f\ufe16•\uc877•\u0fd0\u000a<100>\u20a3•</data>
+<data>•\u080a\u215b\U0001d7d3\u002c•\u2025\U000e012e•\u02df\u118d\u0029\ua8d6\u0085<100>\u6cc4\u2024\u202f\ufffc•</data>
+ 
  
  ########################################################################################
  #
@@ -524,6 +602,120 @@ What is the proper use of the abbreviation pp.? •Yes, I am definatelly 12" tal
  \u0E14\u0E49\u0E27\u0E22<200>\u0e2b\u0e25\u0e32\u0e22<200>\
  \u0e1e\u0e22\u0e32\u0e07\u0e04\u0e4c<200></data>
  
+# Test data originally from http://bugs.icu-project.org/trac/search?q=r30327
+<data>•กู<200> •กิน<200>กุ้ง<200> •ปิ้่<200>งอ<200>ยู่<200>ใน<200>ถ้ำ<200></data>
+
+<data>•\u0E01\u0E39<200>\u0020•\u0E01\u0E34\u0E19<200>\u0E01\u0E38\u0E49\u0E07<200>\
+\u0020•\u0E1B\u0E34\u0E49\u0E48<200>\u0E07\u0E2D<200>\u0E22\u0E39\u0E48<200>\
+\u0E43\u0E19<200>\u0E16\u0E49\u0E33<200></data>
+
+<line>
+<data>•0E01\u0E39\u0020•\u0E01\u0E34\u0E19•\u0E01\u0E38\u0E49\u0E07\
+\u0020•\u0E1B\u0E34\u0E49\u0E48•\u0E07\u0E2D•\u0E22\u0E39\u0E48•\
+\u0E43\u0E19•\u0E16\u0E49\u0E33•</data>
+
+# Data originally from intltest RBBITest::TestThaiLineBreak()
+#
+#  \u0e2f-- the Thai paiyannoi character-- isn't a letter.  It's a symbol that
+#  represents elided letters at the end of a long word.  It should be bound to
+#  the end of the word and not treated as an independent punctuation mark.
+#
+#  the one time where the paiyannoi occurs somewhere other than at the end
+#  of a word is in the Thai abbrevation for "etc.", which both begins and
+#  ends with a paiyannoi
+#
+<line>
+<data>•\u0e2a\u0e16\u0e32\u0e19\u0e35\u0e2f•\
+\u0e08\u0e30•\
+\u0e23\u0e30\u0e14\u0e21•\
+\u0e40\u0e08\u0e49\u0e32•\
+\u0e2b\u0e19\u0e49\u0e32\u0e17\u0e35\u0e48•\
+\u0e2d\u0e2d\u0e01•\
+\u0e21\u0e32•\
+\u0e40\u0e23\u0e48\u0e07•\
+\u0e23\u0e30\u0e1a\u0e32\u0e22•\
+\u0e2d\u0e22\u0e48\u0e32\u0e07•\
+\u0e40\u0e15\u0e47\u0e21•\
+\u0e2f\u0e25\u0e2f•\
+\u0e17\u0e35\u0e48•\
+\u0e19\u0e31\u0e49\u0e19•</data>
+
+# Data originally from RBBITest::TestMixedThaiLineBreak()
+#   @suwit -- Test Arabic numerals, Thai numerals, Punctuation and English characters start
+#
+<line>
+<data>•\u0E1B\u0E35•\
+\u0E1E\u0E38\u0E17\u0E18\u0E28\u0E31\u0E01\u0E23\u0E32\u0E0A •\
+2545 •\
+\u0E40\u0E1B\u0E47\u0E19•\
+\u0E1B\u0E35•\
+\u0E09\u0E25\u0E2D\u0E07•\
+\u0E04\u0E23\u0E1A•\
+\u0E23\u0E2D\u0E1A •\
+\"\u0E52\u0E52\u0E50 •\
+\u0E1b\u0E35\" •\
+\u0E02\u0E2d\u0E07•\
+\u0E01\u0E23\u0E38\u0E07•\
+\u0E23\u0E31\u0E15\u0E19\u0E42\u0E01\u0E2A\u0E34\u0E19\u0E17\u0E23\u0E4C •\
+(\u0E01\u0E23\u0E38\u0E07\u0E40\u0E17\u0E1e\u0E2F•\
+\u0E2B\u0E23\u0E37\u0E2D •\
+Bangkok)•</data>
+
+# Data originally from RBBITest::TestMaiyamok()
+#   The Thai maiyamok character is a shorthand symbol that means "repeat the previous
+#   word".  Instead of appearing as a word unto itself, however, it's kept together
+#   with the word before it.
+#
+<line>
+<data>•\u0e44\u0e1b\u0e46•\
+\u0e21\u0e32\u0e46•\
+\u0e23\u0e30\u0e2b\u0e27\u0e48\u0e32\u0e07•\
+\u0e01\u0e23\u0e38\u0e07•\
+\u0e40\u0e17\u0e1e•\
+\u0e41\u0e25\u0e30•\
+\u0e40\u0e03\u0e35•\
+\u0e22\u0e07•\
+\u0e43\u0e2b\u0e21\u0e48•</data>
+
+
+
+##########################################################################################
+#
+#   Khmer Tests
+#
+##########################################################################################
+
+# Test data originally from http://bugs.icu-project.org/trac/search?q=r30327
+#  from the file testdata/wordsegments.txt
+<locale en>
+<word>
+
+<data>•តើ<200>លោក<200>មក<200>ពី<200>ប្រទេស<200>ណា<200></data>
+<data>•សណ្ដូក<200>ក<200>បណ្ដែត<200>ខ្លួន<200></data>
+<data>•ពណ៌ស<200>ម្ដេច<200>ថា<200>ខ្មៅ<200></data>
+#ប្រយោគ|ពី|របៀប|រួបរួម|និង|ភាព|ផ្សេងគ្នា|ដែល|អាច|ចូល<200></data>
+<data>•ប្រយោគ<200>ពី<200>របៀប<200>ដែល<200>និង<200>ភាព<200>ផ្សេងគ្នា<200>ដែល<200>អាច<200>ចូល<200></data>
+#ប្រយោគ|ពី|របៀប|ជា|មួយ|និង|ភាព|ផ្សេងគ្នា|ដែល|អាច|ចូល<200></data>
+<data>•សូម<200>ចំណាយពេល<200>បន្តិច<200>ដើម្បី<200>អធិស្ឋាន<200>អរព្រះគុណ<200>ដល់<200>ព្រះអង្គ<200></data>
+<data>•ការ<200>ថោកទាប<200>បរិប្បូណ៌<200>ដោយ<200></data>
+<data>•ប្រើប្រាស់<200>ស្អាត<200>ទាំង<200>ចិត្ត<200>សិស្ស<200>នោះ<200></data>
+<data>•បើ<200>អ្នក<200>ប្រព្រឺត្ត<200>អំពើអាក្រក់<200>មុខ<200>ជា<200>មាន<200></data>
+<data>•ប្រដាប់<200>ប្រដា<200>រ<200>រៀនសូត្រ<200>បន្ទប់<200>រៀន<200></data>
+<data>•ដើរតួ<200>មនុស្សគ<200>ឥត<200>បញ្ចេញ<200>យោបល់<200>សោះ<200>ឡើយ<200></data>
+<data>•មិន<200>អាច<200>ឲ្យ<200>យើង<200>ធ្វើ<200>កសិកម្ម<200>បាន<200>ឡើយ<200></data>
+<data>•បន្ត<200>សេចក្ត<200>ទៅទៀត<200></data>
+<data>•ក្រុម<200>ប៉ូលិស<200>បណ្តាក់<200>គ្នា<200></data>
+<data>•គ្មាន<200>សុខ<200>សំរាន្ត<200>ដង<200>ណា<200></data>
+<data>•បាន<200>សុខភាព<200>បរិប្បូណ៌<200></data>
+<data>•ជា<200>មេចោរ<200>ខ្ញុំ<200>នឹង<200>ស្លាប់<200>ទៅវិញ<200>ជា<200>មេចោរ<200></data>
+<data>•ឯ<200>ការ<200>វាយ<200>ផ្ចាល<200>ដែល<200>នាំ<200></data>
+<data>•គេ<200>ដឹក<200>ទៅ<200>សំឡាប់<200></data>
+#អ្នក|ដែល|ជា|មន្ត្រី|ធំ|លើ|គាត់|ទេ<200></data>
+<data>•យក<200>ទៅ<200>សម្លាប់ចោល<200>ស្ងាត់<200></data>
+<data>•ត្រូវ<200>បាន<200>គេ<200>សម្លាប់<200></data>
+<data>•នៅក្នុង<200>ស្រុក<200>ខ្ល<200>ងហ្ស៊ុន<200></data>
+
+
  #
  #  Jitterbug 3671 Test Case
  #
@@ -537,4 +729,99 @@ What is the proper use of the abbreviation pp.? •Yes, I am definatelly 12" tal
  สร้าง<200>บ้าน<200>ต้อง<200>ขน<200>มา<200>ด้วย<200>เกวียน<200>เป็น<200>ระยะ<200>ทาง<200>หลาย<200>\
  ไมล์<200></data>
  
+####################################################################################
+#
+#  Tailored (locale specific) breaking.
+#
+####################################################################################
+
+# Japanese line break tailoring test
+
+<locale ja>
+<line>
+<data>•\u3041•\u3043•\u3045•\u31f1•</data>
+<locale en>
+<line>
+<data>•\u3041\u3043\u3045\u31f1•</data>
+
+# The following data was originally in RBBITest::TestJapaneseWordBreak()
+<locale ja>
+<word>
+<data>•\u4ECA\u65E5<400>\u306F<400>\u3044\u3044<400>\u5929\u6C17<400>\u3067\u3059<400>\u306D<400>\u3002•\u000D\u000A•</data>
+
+# UBreakIteratorType UBRK_WORD, Locale "ja"
+# Don't break in runs of hiragana or runs of ideograph, where the latter includes \u3005 \u3007 \u303B (cldrbug #2009).
+# \u79C1\u9054\u306B\u4E00\u3007\u3007\u3007\u306E\u30B3\u30F3\u30D4\u30E5\u30FC\u30BF\u304C\u3042\u308B\u3002\u5948\u3005\u306F\u30EF\u30FC\u30C9\u3067\u3042\u308B\u3002
+# modified to work with dbbi code - should verify
+
+<locale ja>
+<word>
+<data>•私<400>達<400>に<400>一<400>〇<400>〇〇<400>の<400>コンピュータ<400>が<400>ある<400>。<0>奈々<400>は<400>ワード<400>で<400>ある<400>。•</data>
+
+<locale root>
+<word>
+<data>•私<400>達<400>に<400>一<400>〇<400>〇〇<400>の<400>コンピュータ<400>が<400>ある<400>。<0>奈々<400>は<400>ワード<400>で<400>ある<400>。•</data>
+
+# UBreakIteratorType UBRK_SENTENCE, Locale "el"
+# Add break after Greek question mark (cldrbug #2069).
+# "\u0391\u03B2, \u03B3\u03B4; \u0395 \u03B6\u03B7\u037E \u0398 \u03B9\u03BA. "
+# "\u039B\u03BC \u03BD\u03BE! \u039F\u03C0, \u03A1\u03C2? \u03A3"
+# which is "Αβ, γδ; Ε ζη; Θ ικ. Λμ νξ! Οπ, Ρς? Σ"
+
+<locale root>
+<sent>
+<data>•Αβ, γδ; Ε ζη; Θ ικ. •Λμ νξ! •Οπ, Ρς? •Σ<100></data>
+
+<locale el>
+<sent>
+<data>•Αβ, γδ; •Ε ζη; •Θ ικ. •Λμ νξ! •Οπ, Ρς? •Σ<100></data>
+
+#  UBreakIteratorType UBRK_WORD, Locale "en_US_POSIX"
+#  Words don't include colon or period (cldrbug #1969).
+
+<locale en_US>
+<word>
+<data>•Can't<200> •have<200> •breaks<200> •in<200> •xx:yy<200> •or<200> •struct.field<200> \
+•for<200> •CS<200>-•types<200>.•</data>
+
+<locale en_US_POSIX>
+<word>
+<data>•Can't<200> •have<200> •breaks<200> •in<200> •xx<200>:•yy<200> •or<200> •struct<200>.•field<200> \
+•for<200> •CS<200>-•types<200>.•</data>
+
+
+# UBreakIteratorType UBRK_CHARACTER, Locale "th"
+# Clusters should not include spacing Thai/Lao vowels (prefix or postfix), except for [SARA] AM (cldrbug #2161).
+# Update:  As of Unicode 6.1 root has same behavior as th for this.
+#
+# "\u0E01\u0E23\u0E30\u0E17\u0E48\u0E2D\u0E21\u0E23\u0E08\u0E19\u0E32 "
+# "(\u0E2A\u0E38\u0E0A\u0E32\u0E15\u0E34-\u0E08\u0E38\u0E11\u0E32\u0E21\u0E32\u0E28) "
+# "\u0E40\u0E14\u0E47\u0E01\u0E21\u0E35\u0E1B\u0E31\u0E0D\u0E2B\u0E32 "
+# which is "กระท่อมรจนา (สุชาติ-จุฑามาศ) เด็กมีปัญหา "
+
+<locale th>
+<char>
+<data>•\u0E01•\u0E23•\u0E30•\u0E17\u0E48•\u0E2D•\u0E21•\u0E23•\u0E08•\u0E19•\u0E32• •\
+(•\u0E2A\u0E38•\u0E0A•\u0E32•\u0E15\u0E34•-•\u0E08\u0E38•\u0E11•\u0E32•\u0E21•\u0E32•\u0E28•)• •\
+\u0E40•\u0E14\u0E47•\u0E01•\u0E21\u0E35•\u0E1B\u0E31•\u0E0D•\u0E2B•\u0E32• •</data>
+
+# Finnish line breaking
+#
+# These rules deal with hyphens when there is a space on the leading side. 
+# There should be a break opportunity between the space and the hyphen, and not after the hyphen.
+# See CLDR ticket 3029.
+# See ICU ticket 8151 
+
+<locale root>
+<line>
+<data>•abc •- •def    •abc •-•def    •abc- •def   •abc-•def•</data>   # With ASCII hyphen
+<data>•abc •‐ •def    •abc •‐•def    •abc‐ •def   •abc‐•def•</data>   # With Unicode u2010 hyphen
+
+<locale fi>
+<line>
+# TODO: problems with Finnish line break rules cause these two lines to fail.
+#<data>•abc •- •def    •abc •-def    •abc- •def   •abc-•def•</data>   # With ASCII hyphen
+#<data>•abc •‐ •def    •abc •‐def    •abc‐ •def   •abc‐•def•</data>   # With Unicode u2010 hyphen
  
+<data>•abc •- •def    •abc •-def    •abc- •def   •</data>   # With ASCII hyphen
+<data>•abc •‐ •def    •abc •‐def    •abc‐ •def   •</data>   # With Unicode u2010 hyphen