ICU-400.37.tar.gz

[apple/icu.git] / icuSources / test / testdata / rbbitst.txt
diff --git a/icuSources/test/testdata/rbbitst.txt b/icuSources/test/testdata/rbbitst.txt

index d336c674e814f58474c0d359f7a4ab1b08ce9587..af947dfc09430b2db5c1930d2bfae0ca418aad7d 100644 (file)
--- a/icuSources/test/testdata/rbbitst.txt
+++ b/icuSources/test/testdata/rbbitst.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2001-2006 International Business Machines
+# Copyright (c) 2001-2008 International Business Machines
  # Corporation and others. All Rights Reserved.
  #
  # RBBI Test Data
@@ -71,14 +71,14 @@
  
  
  # Hindi combining chars.  (An old test)
-<data>•भ••ा•\u0930•\u0924• •\u0938\u0941\u0902•\u0926•\u0930•
-•\u0939•\u094c•\u0964•</data>
-<data>•\u0916\u0947•\u0938\u0941\u0902•\u0926•\u0930•\u0939•\u094c•\u0964•</data>
+#   TODO:  Update these tests for Unicode 5.1 Extended Grapheme clusters 
+#<data>•भ••ा•\u0930•\u0924• •\u0938\u0941\u0902•\u0926•\u0930•
+#•\u0939•\u094c•\u0964•</data>
+#<data>•\u0916\u0947•\u0938\u0941\u0902•\u0926•\u0930•\u0939•\u094c•\u0964•</data>
  
  
-#  Bug 1587.  Tamil.  \u0baa\u0bc1 should be two separate characters, even though
-#                     Hyangmi would perfer that it be one.
-<data>•\u0baa•\u0bc1•\u0baa•\u0bc1•</data>
+#  Bug 1587.  Tamil.  \u0baa\u0bc1 is an Extended Grapheme Cluster
+<data>•\u0baa\u0bc1•\u0baa\u0bc1•</data>
  
  #   Regression test for bug 1889
  <data>•\u0f40\u0f7d•\u0000•\u0f7e•</data>
@@ -91,6 +91,28 @@
  #  Treat Japanese Half Width voicing marks as combining
  <data>•A\uff9e•B\uff9f\uff9e\uff9f•C•</data>
  
+########################################################################################
+#
+#
+#       E x t e n d e d    G r a p h e m e    C l u s t e r     T e s t s
+#
+#
+##########################################################################################
+#<xgc>
+
+# Plain Vanilla grapheme clusters
+#<data>•a•b•c•</data>
+#<data>•a\u0301\u0302• •b\u0303\u0304•</data>
+
+# Assorted Hindi combining marks
+#<data>•\u0904\u0903• •\u0937\u093E• •\u0904\u093F• •\u0937\u0940• •\u0937\u0949• •\u0937\u094A• •\u0937\u094B• •\u0937\u094C•</data>
+
+# Thai Clusters
+# $Prepend $Extend* $PrependBase $Extend*;
+#
+#<data>•\u0e40\u0e01•\u0e44\u0301\u0e23\u0302\u0303•\u0e40•\u0e40\u0e02•\u0e02• •</data>
+
+
  ########################################################################################
  #
  #
@@ -481,7 +503,10 @@ What is the proper use of the abbreviation pp.? •Yes, I am definatelly 12" tal
  <data>•\u4e01•\ud840\udc01•\u4e02•abc •\ue000 •\udb80\udc01•</data>
  
  #      Regression for bug 836
-<data>•AAA(AAA •</data> 
+#        Note:  Unicode 5.1 changed this behavior
+#               ICU will want to change it back before releasing,
+#               so there is no break preceding the '('
+<data>•AAA•(AAA •</data> 
  
  #       Try some words from other scripts.
  #          Greek, Cyrillic, Hebrew, Arabic, Arabic, Georgian, Latin
@@ -489,6 +514,7 @@ What is the proper use of the abbreviation pp.? •Yes, I am definatelly 12" tal
  <data>•ΑΒΓ •БВГ •אבג֓ •ابت •١٢٣ •\u10A0\u10A1\u10A2 •ABC •</data>
  
  
+
  ########################################################################################
  #
  #