ICU-531.48.tar.gz

author Apple <opensource@apple.com>

Tue, 10 Mar 2015 17:57:33 +0000 (17:57 +0000)

committer Apple <opensource@apple.com>

Tue, 10 Mar 2015 17:57:33 +0000 (17:57 +0000)
author Apple <opensource@apple.com>
Tue, 10 Mar 2015 17:57:33 +0000 (17:57 +0000)
committer Apple <opensource@apple.com>
Tue, 10 Mar 2015 17:57:33 +0000 (17:57 +0000)
diff --git a/icuSources/common/brkiter.cpp b/icuSources/common/brkiter.cpp

index 5931ebf7fbd70fe796dbe1851edf5121f6722279..b6ff490c1573cb673e408928ecd6576a73853b4f 100644 (file)
--- a/icuSources/common/brkiter.cpp
+++ b/icuSources/common/brkiter.cpp
@@ -389,6 +389,7 @@ BreakIterator::createInstance(const Locale& loc, int32_t kind, UErrorCode& statu
  }
  
  // -------------------------------------
+enum { kLBTypeLenMax = 32 };
  
  BreakIterator*
  BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
@@ -397,6 +398,7 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
      if (U_FAILURE(status)) {
          return NULL;
      }
+    char lbType[kLBTypeLenMax];
  
      BreakIterator *result = NULL;
      switch (kind) {
@@ -407,7 +409,17 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
          result = BreakIterator::buildInstance(loc, "word", kind, status);
          break;
      case UBRK_LINE:
-        result = BreakIterator::buildInstance(loc, "line", kind, status);
+        uprv_strcpy(lbType, "line");
+        {
+            char lbKeyValue[kLBTypeLenMax];
+            UErrorCode kvStatus = U_ZERO_ERROR;
+            loc.getKeywordValue("lb", lbKeyValue, kLBTypeLenMax, kvStatus);
+            if (U_SUCCESS(kvStatus) && (uprv_strcmp(lbKeyValue,"strict")==0 || uprv_strcmp(lbKeyValue,"normal")==0 || uprv_strcmp(lbKeyValue,"loose")==0)) {
+                uprv_strcat(lbType, "_");
+                uprv_strcat(lbType, lbKeyValue);
+            }
+        }
+        result = BreakIterator::buildInstance(loc, lbType, kind, status);
          break;
      case UBRK_SENTENCE:
          result = BreakIterator::buildInstance(loc, "sentence", kind, status);
diff --git a/icuSources/common/ualoc.cpp b/icuSources/common/ualoc.cpp

index 21a0ec9ab998bf1e40811ab308fce745e341a22f..d61accb21272341cef77357e1753cedb0d677ba0 100644 (file)
--- a/icuSources/common/ualoc.cpp
+++ b/icuSources/common/ualoc.cpp
@@ -188,6 +188,7 @@ ualoc_getAppleParent(const char* localeID,
      int32_t len;
      UErrorCode tempStatus;
      char locbuf[ULOC_FULLNAME_CAPACITY+1];
+    char * foundDoubleUnderscore;
  
      if (U_FAILURE(*err)) {
          return 0;
@@ -196,7 +197,7 @@ ualoc_getAppleParent(const char* localeID,
          *err = U_ILLEGAL_ARGUMENT_ERROR;
          return 0;
      }
-    len = uloc_canonicalize(localeID, locbuf, ULOC_FULLNAME_CAPACITY, err);
+    len = uloc_getBaseName(localeID, locbuf, ULOC_FULLNAME_CAPACITY, err); /* canonicalize and strip keywords */
      if (U_FAILURE(*err)) {
          return 0;
      }
@@ -204,6 +205,11 @@ ualoc_getAppleParent(const char* localeID,
          locbuf[ULOC_FULLNAME_CAPACITY] = 0;
          *err = U_ZERO_ERROR;
      }
+    foundDoubleUnderscore = uprv_strstr(locbuf, "__"); /* __ comes from bad/missing subtag or variant */
+    if (foundDoubleUnderscore != NULL) {
+        *foundDoubleUnderscore = 0; /* terminate at the __ */
+        len = uprv_strlen(locbuf);
+    }
      if (len >= 2 && uprv_strncmp(locbuf, "zh", 2) == 0) {
          const char ** forceParentPtr = forceParent;
          const char * testCurLoc;
diff --git a/icuSources/common/ubidi_props_data.h b/icuSources/common/ubidi_props_data.h

index 7c66df44060bce85698d4dd6051fff49b5ad30c4..3beb207dd34f03f328cb1c9b9ec24b6bce31c361 100644 (file)
--- a/icuSources/common/ubidi_props_data.h
+++ b/icuSources/common/ubidi_props_data.h
@@ -16,13 +16,13 @@ static const UVersionInfo ubidi_props_dataVersion={6,3,0,0};
  #if !U_PLATFORM_IS_DARWIN_BASED
  static const int32_t ubidi_props_indexes[UBIDI_IX_TOP]={0x10,0x53f0,0x50b8,0x1a,0x620,0x8b0,0,0,0,0,0,0,0,0,0,0x3902b6};
  #else /* !U_PLATFORM_IS_DARWIN_BASED */
-static const int32_t ubidi_props_indexes[UBIDI_IX_TOP]={0x10,0x54f0,0x51b8,0x1a,0x620,0x8b0,0,0,0,0,0,0,0,0,0,0x3902b6};
+static const int32_t ubidi_props_indexes[UBIDI_IX_TOP]={0x10,0x54e8,0x51b0,0x1a,0x620,0x8b0,0,0,0,0,0,0,0,0,0,0x3902b6};
  #endif /* !U_PLATFORM_IS_DARWIN_BASED */
  
  #if !U_PLATFORM_IS_DARWIN_BASED
  static const uint16_t ubidi_props_trieIndex[10324]={
  #else /* !U_PLATFORM_IS_DARWIN_BASED */
-static const uint16_t ubidi_props_trieIndex[10452]={
+static const uint16_t ubidi_props_trieIndex[10448]={
  #endif /* !U_PLATFORM_IS_DARWIN_BASED */
  0x320,0x328,0x330,0x338,0x350,0x358,0x360,0x368,0x340,0x348,0x340,0x348,0x340,0x348,0x340,0x348,
  0x340,0x348,0x340,0x348,0x36e,0x376,0x37e,0x386,0x38e,0x396,0x392,0x39a,0x3a2,0x3aa,0x3a5,0x3ad,
@@ -256,8 +256,8 @@ static const uint16_t ubidi_props_trieIndex[10452]={
  0x3d5,0x3d5,0x3d5,0x43f,0x43f,0x43f,0x43f,0x43f,0x43f,0x43f,0x9a7,0x3d5,0x3d5,0x3d5,0x3d5,0x3d5,
  0x3d5,0x3d5,0x3d5,0x5ee,0x7b9,0x5ee,0x5ee,0x5f1,0x9b7,0x9bf,0x340,0x9af,0x340,0x340,0x9c7,0x340,
  0x340,0x340,0x340,0x340,0x340,0x340,0x340,0x340,0x340,0x340,0x340,0x5ee,0x9cf,0x5ee,0x9d5,0x5f1,
-0x5ee,0x9dd,0x9e5,0x5ee,0x9ed,0x9f5,0x5ee,0x5ee,0x5ee,0x5ee,0x9f7,0x5ee,0x9ff,0xa07,0x7f0,0x340,
-0x340,0x340,0x6fb,0x5ee,0x5ee,0xa0f,0x340,0x5ee,0x5ee,0x6f9,0x340,0x5ee,0x5ee,0x5ee,0x5f1,0x340,
+0x5ee,0x9dd,0x9e5,0x5ee,0x9ec,0x9f4,0x5ee,0x5ee,0x5ee,0x5ee,0x9f6,0x5ee,0x9fe,0xa06,0x7f0,0x340,
+0x340,0x340,0x6fb,0x5ee,0x5ee,0xa0e,0x340,0x5ee,0x5ee,0x6f9,0x340,0x5ee,0x5ee,0x5ee,0x5f1,0x340,
  #endif /* !U_PLATFORM_IS_DARWIN_BASED */
  0x340,0x340,0x340,0x340,0x340,0x340,0x340,0x340,0x340,0x340,0x340,0x340,0x340,0x340,0x340,0x340,
  0x340,0x340,0x340,0x340,0x340,0x340,0x340,0x340,0x340,0x340,0x340,0x340,0x340,0x340,0x340,0x340,
@@ -270,12 +270,12 @@ static const uint16_t ubidi_props_trieIndex[10452]={
  0xa04,0xa04,0xa04,0xa04,0xa04,0xa04,0xa04,0xa04,0xa04,0xa04,0xa04,0xa04,0xa04,0xa04,0xa04,0xa04,
  0xa04,0xa04,0xa04,0xa04,0xa04,0xa04,0xa04,0xa04,0xa04,0xa04,0xa04,0xa04,0xa04,0xa04,0xa04,0x31f,
  #else /* !U_PLATFORM_IS_DARWIN_BASED */
-0x340,0x340,0x340,0x340,0x340,0x340,0x340,0x340,0x340,0x340,0x340,0x340,0x340,0x340,0xa13,0xa23,
-0xa1b,0xa1b,0xa1b,0xa24,0xa24,0xa24,0xa24,0x38e,0x38e,0x38e,0x38e,0x38e,0x38e,0x38e,0xa2c,0xa24,
-0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,
-0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,
-0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,
-0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0x31f,
+0x340,0x340,0x340,0x340,0x340,0x340,0x340,0x340,0x340,0x340,0x340,0x340,0x340,0x340,0xa12,0xa22,
+0xa1a,0xa1a,0xa1a,0xa23,0xa23,0xa23,0xa23,0x38e,0x38e,0x38e,0x38e,0x38e,0x38e,0x38e,0xa2b,0xa23,
+0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,
+0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,
+0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,
+0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0x31f,
  #endif /* !U_PLATFORM_IS_DARWIN_BASED */
  0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,8,7,8,9,7,0x12,0x12,
  0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,7,7,7,8,
@@ -715,26 +715,60 @@ static const uint16_t ubidi_props_trieIndex[10452]={
  0xa,0,0,0,0xa,0xa,0xa,0xa,0xa,0,0xa,0xa,0xa,0xa,0xa,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,
+#if !U_PLATFORM_IS_DARWIN_BASED
  0xa,0xa,0xa,0xa,0xa,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,
+#else /* !U_PLATFORM_IS_DARWIN_BASED */
+0xa,0xa,0xa,0xa,0xa,0,0,0,0,0,0,0,0,0,0,0xa,
+#endif /* !U_PLATFORM_IS_DARWIN_BASED */
  0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,
+#if !U_PLATFORM_IS_DARWIN_BASED
  0xa,0xa,0xa,0,0xa,0,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,
+#else /* !U_PLATFORM_IS_DARWIN_BASED */
+0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0,
+0xa,0,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,
+#endif /* !U_PLATFORM_IS_DARWIN_BASED */
  0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,
+#if !U_PLATFORM_IS_DARWIN_BASED
  0xa,0xa,0xa,0xa,0,0xa,0xa,0xa,0xa,0,0,0,0xa,0xa,0xa,0xa,
+#else /* !U_PLATFORM_IS_DARWIN_BASED */
+0,0xa,0xa,0xa,0xa,0,0,0,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,
+#endif /* !U_PLATFORM_IS_DARWIN_BASED */
  0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,
+#if !U_PLATFORM_IS_DARWIN_BASED
  0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0,0,0xa,0xa,0xa,0xa,
  0,0,0,0,0,0,0,0,0,0,0,0,0xa,0xa,0xa,0xa,
  0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0,0,0,
  0,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0,0,0,0,
+#else /* !U_PLATFORM_IS_DARWIN_BASED */
+0xa,0xa,0xa,0xa,0xa,0xa,0,0,0xa,0xa,0xa,0xa,0,0,0,0,
+0,0,0,0,0,0,0,0,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,
+0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0,0,0,0,0xa,0xa,0xa,
+0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0,0,0,0,0,0,0,0,
+#endif /* !U_PLATFORM_IS_DARWIN_BASED */
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+#if !U_PLATFORM_IS_DARWIN_BASED
  0,0,0,0,0,0,0,0,0,0,0x12,0x12,0xb2,0xb2,0xb2,0xb2,
+#else /* !U_PLATFORM_IS_DARWIN_BASED */
+0,0,0,0,0,0,0x12,0x12,0xb2,0xb2,0xb2,0xb2,0xb2,0xb2,0xb2,0xb2,
+#endif /* !U_PLATFORM_IS_DARWIN_BASED */
  0xb2,0xb2,0xb2,0xb2,0xb2,0xb2,0xb2,0xb2,0xb2,0xb2,0xb2,0xb2,0xb2,0xb2,0xb2,0xb2,
+#if !U_PLATFORM_IS_DARWIN_BASED
  0xb2,0xb2,0xb2,0xb2,0xb2,0xb2,0xb2,0xb2,0xb2,0xb2,0xb2,0xb2,0x12,0xb2,0x12,0x12,
  0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,
  0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,
  0xb1,0xb1,0xb1,0xb1,0xb1,0xb1,0xb1,0xb1,0xb1,0xb1,0xb1,0xb1,0xb1,0xb1,0xb1,0xb1,
+#else /* !U_PLATFORM_IS_DARWIN_BASED */
+0xb2,0xb2,0xb2,0xb2,0xb2,0xb2,0xb2,0xb2,0x12,0xb2,0x12,0x12,0x12,0x12,0x12,0x12,
+#endif /* !U_PLATFORM_IS_DARWIN_BASED */
  0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,
+#if !U_PLATFORM_IS_DARWIN_BASED
  0,0,0,0
+#else /* !U_PLATFORM_IS_DARWIN_BASED */
+0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0xb1,0xb1,0xb1,0xb1,
+0xb1,0xb1,0xb1,0xb1,0xb1,0xb1,0xb1,0xb1,0xb1,0xb1,0xb1,0xb1,0x12,0x12,0x12,0x12,
+0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0,0,0,0
+#endif /* !U_PLATFORM_IS_DARWIN_BASED */
  };
  
  static const uint32_t ubidi_props_mirrors[26]={
@@ -799,7 +833,7 @@ static const UBiDiProps ubidi_props_singleton={
  #if !U_PLATFORM_IS_DARWIN_BASED
      7124,
  #else /* !U_PLATFORM_IS_DARWIN_BASED */
-    7252,
+    7248,
  #endif /* !U_PLATFORM_IS_DARWIN_BASED */
      0x1a0,
      0xd00,
@@ -809,7 +843,7 @@ static const UBiDiProps ubidi_props_singleton={
  #if !U_PLATFORM_IS_DARWIN_BASED
      0x2850,
  #else /* !U_PLATFORM_IS_DARWIN_BASED */
-    0x28d0,
+    0x28cc,
  #endif /* !U_PLATFORM_IS_DARWIN_BASED */
      NULL, 0, FALSE, FALSE, 0, NULL
    },
diff --git a/icuSources/common/uchar_props_data.h b/icuSources/common/uchar_props_data.h

index 902e02df894fe494d27a7a6c6fee06cb7efae9a1..60af43065ae6ec6a7e8495355fd1756e930874e9 100644 (file)
--- a/icuSources/common/uchar_props_data.h
+++ b/icuSources/common/uchar_props_data.h
@@ -1480,7 +1480,7 @@ static const uint16_t propsTrie_index[18064]={
  0x1b,0,0x1b,0x1b,0x1b,0x1b,0x1b,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0x1b,0x1b,0x1b,0x1b,
  0x1b,0x1b,0x1b,0x1b,0x1b,0x1b,0x1b,0x1b,0x1b,0x1b,0x1b,0x1b,0x1b,0,0,0,
-0,0,0,0,0,0,0,0,0,0,0,0,0x1b,0,0x1b,0x1b,
+0,0,0,0,0,0,0,0x1a,0x1a,0x1a,0x1a,0x1a,0x1b,0,0x1b,0x1b,
  #endif /* !U_PLATFORM_IS_DARWIN_BASED */
  0x1b,0x1b,0x1b,0x1b,0x1b,0x1b,0x1b,0x1b,0x1b,0x1b,0x1b,0x1b,0x1b,0x1b,0x1b,0x1b,
  #if !U_PLATFORM_IS_DARWIN_BASED
@@ -3386,7 +3386,7 @@ static const uint16_t propsVectorsTrie_index[23612]={
  0x1266,0x1266,0x1266,0x1b3,0x1b3,0x1b3,0x1b3,0x1b3,0x1b3,0x1b3,0x1b3,0x1b3,0x1b3,0x1b3,0x1b3,0x1b3,
  0x1b3,0x1b3,0x1b3,0x1b3,0x1b3,0x1b3,0x1b3,0x1b3,0x1266,0x1266,0x1266,0x1266,0x1266,0x1266,0x1266,0x1266,
  0x1266,0x1266,0x1266,0x1266,0x1266,0x1266,0x1266,0x1266,0x1266,0x1b3,0x1b3,0x1b3,0x1b3,0x1b3,0x1b3,0x1b3,
-0x1b3,0x1b3,0x1b3,0x1b3,0x1b3,0x1b3,0x1b3,0x1b3,0x1266,0x1266,0x1266,0x1266,0x1266,0x1266,0x1266,0x1266,
+0x1b3,0x1b3,0x1b3,0x136b,0x136b,0x136b,0x136b,0x136b,0x1266,0x1266,0x1266,0x1266,0x1266,0x1266,0x1266,0x1266,
  0x1266,0x1266,0x1266,0x1266,0x1266,0x1266,0x1266,0x1266,0x1266,0x1266,0x1266,0x1266,0x1266,0x1266,0x1266,0x1266,
  0x1266,0x1266,0x1266,0x1266,0x1266,0x1266,0x1266,0x1b3,0x1266,0x1b3,0x1266,0x1266,0x1266,0x1266,0x1266,0x1266,
  0x1266,0x1266,0x1266,0x1266,0x1266,0x1266,0x1266,0x1266,0x1266,0x1266,0x1266,0x1266,0x1266,0x1266,0x1266,0x1266,
@@ -4643,7 +4643,7 @@ static const UTrie2 propsVectorsTrie={
  #if !U_PLATFORM_IS_DARWIN_BASED
  static const uint32_t propsVectors[4917]={
  #else /* !U_PLATFORM_IS_DARWIN_BASED */
-static const uint32_t propsVectors[4971]={
+static const uint32_t propsVectors[4974]={
  #endif /* !U_PLATFORM_IS_DARWIN_BASED */
  0x67,0,0,0x67,0x80000,0x20,0x867,0,0,0xa67,0,0,0xb67,0,0,0xc67,
  0,0,0xd67,0,0,0xe67,0,0,0xf67,0,0,0x1067,0,0,0x1167,0,
@@ -5184,13 +5184,13 @@ static const uint32_t propsVectors[4971]={
  0x6804400,0x962540,0x6100d997,0x7c00100,0x230400,0x6100d997,0xc000010,0x448000,0x6100da98,0x6800000,0x1329800,0x6100da98,0x7c00100,0x230400,0x6100db71,0x4000000,
  0x200000,0x6100dc99,0x2802100,0x962460,0x6100dc99,0x2802400,0x962460,0x6100dc99,0x6800000,0x1329800,0x6100dc99,0x6800100,0x962540,0x6100dc99,0x6804400,0x962540,
  0x6100dc99,0x7c00100,0x230400,0x610a4711,0x7c40300,0xe30000,0x610a4f11,0x7c00300,0xe30001,0x6140af2d,0x6800100,0x962540,0x6180af2d,0x2802400,0x962460,0x62002a00,
-0x4000000,0x1600000,0x63002800,0x80000,0x918820,0x63c00c09,0x80000,0x918820,0x70002a00,0x4000000,0x1600000};
+0x4000000,0x1600000,0x63002800,0x80000,0x918820,0x63c00c09,0x80000,0x918820,0x70002a00,0x4000000,0x1600000,0x8000cd00,0x4000000,0xe00000};
  #endif /* !U_PLATFORM_IS_DARWIN_BASED */
  
  #if !U_PLATFORM_IS_DARWIN_BASED
  static const int32_t countPropsVectors=4917;
  #else /* !U_PLATFORM_IS_DARWIN_BASED */
-static const int32_t countPropsVectors=4971;
+static const int32_t countPropsVectors=4974;
  #endif /* !U_PLATFORM_IS_DARWIN_BASED */
  static const int32_t propsVectorsColumns=3;
  static const uint16_t scriptExtensions[74]={
@@ -5203,6 +5203,6 @@ static const uint16_t scriptExtensions[74]={
  #if !U_PLATFORM_IS_DARWIN_BASED
  static const int32_t indexes[UPROPS_INDEX_COUNT]={0x231a,0x231a,0x231a,0x231a,0x50da,3,0x640f,0x6434,0x6434,0x6434,0xadca0,0x2774191,0,0,0,0};
  #else /* !U_PLATFORM_IS_DARWIN_BASED */
-static const int32_t indexes[UPROPS_INDEX_COUNT]={0x235c,0x235c,0x235c,0x235c,0x517e,3,0x64e9,0x650e,0x650e,0x650e,0xadca0,0x2774191,0,0,0,0};
+static const int32_t indexes[UPROPS_INDEX_COUNT]={0x235c,0x235c,0x235c,0x235c,0x517e,3,0x64ec,0x6511,0x6511,0x6511,0xadca0,0x2774191,0,0,0,0};
  #endif /* !U_PLATFORM_IS_DARWIN_BASED */
  
diff --git a/icuSources/data/brkitr/brkfiles.mk b/icuSources/data/brkitr/brkfiles.mk

index 54e44a8d6696211c547484428617c32d162072ff..681e8dfdadfe0728dda708ee61521f5d25da5d32 100644 (file)
--- a/icuSources/data/brkitr/brkfiles.mk
+++ b/icuSources/data/brkitr/brkfiles.mk
@@ -38,12 +38,14 @@ BRK_DICT_SOURCE = cjdict.txt khmerdict.txt laodict.txt thaidict.txt
  
  
  # List of break iterator files (brk).
-BRK_SOURCE = char.txt line.txt line_fi.txt line_ja.txt\
+BRK_SOURCE = char.txt line.txt line_fi.txt\
+ line_loose.txt line_loose_cj.txt line_loose_fi.txt\
+ line_normal.txt line_normal_cj.txt line_normal_fi.txt\
   sent.txt sent_el.txt title.txt word.txt word_POSIX.txt
  
  
  # Ordinary resources
  BRK_RES_SOURCE = de.txt el.txt en.txt en_US.txt\
   en_US_POSIX.txt es.txt fi.txt fr.txt it.txt\
- ja.txt pt.txt ru.txt
+ ja.txt pt.txt ru.txt zh.txt zh_Hant.txt
  
diff --git a/icuSources/data/brkitr/char.txt b/icuSources/data/brkitr/char.txt

index 58f09ba33d30bfa8828ed23a30d1fcf5cd0e8822..aa5d906c23f479d88db08e561b8ea8c50c1cd9cd 100644 (file)
--- a/icuSources/data/brkitr/char.txt
+++ b/icuSources/data/brkitr/char.txt
@@ -1,5 +1,5 @@
  #
-#   Copyright (C) 2002-2013, International Business Machines Corporation and others.
+#   Copyright (C) 2002-2015, International Business Machines Corporation and others.
  #       All Rights Reserved.
  #
  #   file:  char.txt 
@@ -19,47 +19,58 @@ $Control     = [\p{Grapheme_Cluster_Break = Control}];
  $Extend      = [\p{Grapheme_Cluster_Break = Extend}];
  $SpacingMark = [\p{Grapheme_Cluster_Break = SpacingMark}];
  
-$RI_A        = \U0001F1E6;  # Trail ERTU
-$RI_B        = \U0001F1E7;  # Trail EGR
-$RI_C        = \U0001F1E8;  # Trail AHLNZ
+$RI_A        = \U0001F1E6;  # Trail ETU
+$RI_B        = \U0001F1E7;  # Trail ER
+$RI_C        = \U0001F1E8;  # Trail AHLNO
  $RI_D        = \U0001F1E9;  # Trail EK
-$RI_E        = \U0001F1EA;  # Trail GS
+$RI_E        = \U0001F1EA;  # Trail S
  $RI_F        = \U0001F1EB;  # Trail IR
-$RI_G        = \U0001F1EC;  # Trail BR
-$RI_H        = \U0001F1ED;  # Trail KU
-$RI_I        = \U0001F1EE;  # Trail DLNT
+$RI_G        = \U0001F1EC;  # Trail B
+$RI_H        = \U0001F1ED;  # Trail K
+$RI_I        = \U0001F1EE;  # Trail DELNT
  $RI_J        = \U0001F1EF;  # Trail OP
  $RI_K        = \U0001F1F0;  # Trail R
-$RI_L        = \U0001F1F1;  # Trail B
  $RI_M        = \U0001F1F2;  # Trail OXY
-$RI_N        = \U0001F1F3;  # Trail LO
-$RI_P        = \U0001F1F5;  # Trail LT
-$RI_R        = \U0001F1F7;  # Trail OU
-$RI_S        = \U0001F1F8;  # Trail AEGK
-$RI_T        = \U0001F1F9;  # Trail HRW
-$RI_U        = \U0001F1FA;  # Trail AS
+$RI_N        = \U0001F1F3;  # Trail LOZ
+$RI_P        = \U0001F1F5;  # Trail HLRT
+$RI_R        = \U0001F1F7;  # Trail U
+$RI_S        = \U0001F1F8;  # Trail AEG
+$RI_T        = \U0001F1F9;  # Trail R
+$RI_U        = \U0001F1FA;  # Trail S
  $RI_V        = \U0001F1FB;     # Trail N
+$RI_Z        = \U0001F1FF;     # Trail A
+# unused trail values, safe as additional lead: C F J M Q V W
+# unused lead values, safe as additional trail: L O Q W X Y
  
-$RI_A_End    = [\U0001F1EA \U0001F1F7 \U0001F1F9 \U0001F1FA];          # ERTU
-$RI_B_End    = [\U0001F1EA \U0001F1EC \U0001F1F7];              # EGR
-$RI_C_End    = [\U0001F1E6 \U0001F1ED \U0001F1F1 \U0001F1F3 \U0001F1FF]; # AHLNZ
+$RI_A_End    = [\U0001F1EA \U0001F1F9 \U0001F1FA];             # ETU
+$RI_B_End    = [\U0001F1EA \U0001F1F7];                         # ER
+$RI_C_End    = [\U0001F1E6 \U0001F1ED \U0001F1F1 \U0001F1F3 \U0001F1F4]; # AHLNO
  $RI_D_End    = [\U0001F1EA \U0001F1F0];                         # EK
-$RI_E_End    = [\U0001F1EC \U0001F1F8];                         # GS
+$RI_E_End    = \U0001F1F8;                                      # S
  $RI_F_End    = [\U0001F1EE \U0001F1F7];                         # IR
-$RI_G_End    = [\U0001F1E7 \U0001F1F7];                         # BR
-$RI_H_End    = [\U0001F1F0 \U0001F1FA];                         # KU
-$RI_I_End    = [\U0001F1E9 \U0001F1F1 \U0001F1F3 \U0001F1F9];  # DLNT
-$RI_J_End    = [\U0001F1F4 \U0001F1F5];                         # OP
+$RI_G_End    = \U0001F1E7;                                      # B
+$RI_H_End    = \U0001F1F0;                                      # K
+$RI_I_End    = [\U0001F1E9 \U0001F1EA \U0001F1F1 \U0001F1F3 \U0001F1F9]; # DELNT
+$RI_J_End    = [\U0001F1F5 \U0001F1F4];                         # OP
  $RI_K_End    = \U0001F1F7;                                      # R
-$RI_L_End    = \U0001F1E7;                                      # B
  $RI_M_End    = [\U0001F1F4 \U0001F1FD \U0001F1FE];              # OXY
-$RI_N_End    = [\U0001F1F1 \U0001F1F4];                         # LO
-$RI_P_End    = [\U0001F1F1 \U0001F1F9];                         # LT
-$RI_R_End    = [\U0001F1F4 \U0001F1FA];                         # OU
-$RI_S_End    = [\U0001F1E6 \U0001F1EA \U0001F1EC \U0001F1F0];   # AEGK
-$RI_T_End    = [\U0001F1ED \U0001F1F7 \U0001F1FC];              # HRW
-$RI_U_End    = [\U0001F1E6 \U0001F1F8];                         # AS
+$RI_N_End    = [\U0001F1F1 \U0001F1F4 \U0001F1FF];              # LOZ
+$RI_P_End    = [\U0001F1ED \U0001F1F1 \U0001F1F7 \U0001F1F9];   # HLRT
+$RI_R_End    = \U0001F1FA;                                      # U
+$RI_S_End    = [\U0001F1E6 \U0001F1EA \U0001F1EC];              # AEG
+$RI_T_End    = \U0001F1F7;                                      # R
+$RI_U_End    = \U0001F1F8;                                      # S
  $RI_V_End    = \U0001F1F3;                                      # N
+$RI_Z_End    = \U0001F1E6;                                      # A
+
+# Special character classes for people & body part emoji:
+# Subsets of $Extend:
+$ZWJ = \u200D;
+$EmojiVar = [\uFE0F];
+# The following are subsets of \p{Grapheme_Cluster_Break = Other} which is not otherwise used here
+$EmojiForSeqs = [\u2764 \U0001F466-\U0001F469 \U0001F48B];
+$EmojiForMods = [\u261D \u270A-\u270C \U0001F385 \U0001F3C3-\U0001F3C4 \U0001F3C7 \U0001F3CA \U0001F442-\U0001F443 \U0001F446-\U0001F450 \U0001F466-\U0001F469 \U0001F46E-\U0001F478 \U0001F47C \U0001F481-\U0001F483 \U0001F485-\U0001F487 \U0001F4AA \U0001F596 \U0001F645-\U0001F647 \U0001F64B-\U0001F64F \U0001F6A3 \U0001F6B4-\U0001F6B6 \U0001F6C0];
+$EmojiMods = [\U0001F3FB-\U0001F3FF];
  
  #
  # Korean Syllable Definitions
@@ -94,7 +105,6 @@ $RI_H $RI_H_End;
  $RI_I $RI_I_End;
  $RI_J $RI_J_End;
  $RI_K $RI_K_End;
-$RI_L $RI_L_End;
  $RI_M $RI_M_End;
  $RI_N $RI_N_End;
  $RI_P $RI_P_End;
@@ -103,12 +113,18 @@ $RI_S $RI_S_End;
  $RI_T $RI_T_End;
  $RI_U $RI_U_End;
  $RI_V $RI_V_End;
+$RI_Z $RI_Z_End;
  
  [^$Control $CR $LF] $Extend;
  
  [^$Control $CR $LF] $SpacingMark;
  # TODO: Restore if the Prepend set becomes non-empty again: $Prepend [^$Control $CR $LF];
  
+# Special forward rules for people & body part emoji:
+# don't break $ZWJ from subsequent $EmojiForSeqs; don't break between relevant emoji and $EmojiMods
+$ZWJ $EmojiForSeqs;
+$EmojiForMods $EmojiVar? $EmojiMods;
+
  
  ## -------------------------------------------------
  
@@ -129,7 +145,6 @@ $RI_H_End $RI_H;
  $RI_I_End $RI_I;
  $RI_J_End $RI_J;
  $RI_K_End $RI_K;
-$RI_L_End $RI_L;
  $RI_M_End $RI_M;
  $RI_N_End $RI_N;
  $RI_P_End $RI_P;
@@ -138,11 +153,16 @@ $RI_S_End $RI_S;
  $RI_T_End $RI_T;
  $RI_U_End $RI_U;
  $RI_V_End $RI_V;
+$RI_Z_End $RI_Z;
  
  $Extend      [^$Control $CR $LF];
  $SpacingMark [^$Control $CR $LF];
  # TODO: Restore if the Prepend set becomes non-empty again: [^$Control $CR $LF] $Prepend;
  
+# Special reverse rules for people & body part emoji:
+# don't break $ZWJ from subsequent $EmojiForSeqs; don't break between relevant emoji and $EmojiMods
+$EmojiForSeqs $ZWJ;
+$EmojiMods $EmojiVar? $EmojiForMods;
  
  ## -------------------------------------------------
  #  We don't logically need safe char break rules, but if we don't provide any at all
@@ -151,9 +171,11 @@ $SpacingMark [^$Control $CR $LF];
  
  !!safe_reverse;
  $LF $CR;
+[$EmojiVar $EmojiMods]+ $EmojiForMods;
  
  ## -------------------------------------------------
  
  !!safe_forward;
  $CR $LF;
+$EmojiForMods [$EmojiVar $EmojiMods]+;
  
diff --git a/icuSources/data/brkitr/fi.txt b/icuSources/data/brkitr/fi.txt

index 4287a8922519250a01f9345d8b9cbdeeb073638f..1a7aba3349f85b155695b9ac3b0530de195487c4 100644 (file)
--- a/icuSources/data/brkitr/fi.txt
+++ b/icuSources/data/brkitr/fi.txt
@@ -10,5 +10,8 @@ fi{
      Version{"2.0.82.42"}
      boundaries{
          line:process(dependency){"line_fi.brk"}
+        line_loose:process(dependency){"line_loose_fi.brk"}
+        line_normal:process(dependency){"line_normal_fi.brk"}
+        line_strict:process(dependency){"line_fi.brk"}
      }
  }
diff --git a/icuSources/data/brkitr/ja.txt b/icuSources/data/brkitr/ja.txt

index 3eb81d0b1cc45c59b31a80f5fa5425a21c031e49..76c39b7b0192c457cc66d0f63795a5d4dc136f3c 100644 (file)
--- a/icuSources/data/brkitr/ja.txt
+++ b/icuSources/data/brkitr/ja.txt
@@ -9,6 +9,9 @@
  ja{
      Version{"2.0.82.42"}
      boundaries{
-        line:process(dependency){"line_ja.brk"}
+        line:process(dependency){"line_normal.brk"}
+        line_loose:process(dependency){"line_loose_cj.brk"}
+        line_normal:process(dependency){"line_normal_cj.brk"}
+        line_strict:process(dependency){"line.brk"}
      }
  }
diff --git a/icuSources/data/brkitr/line.txt b/icuSources/data/brkitr/line.txt

index 74560af044d10bf02117d09fb0e54b8cf99d1a49..8447d51435f7b5a8b3b449831a43f77dfa7e2061 100644 (file)
--- a/icuSources/data/brkitr/line.txt
+++ b/icuSources/data/brkitr/line.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2002-2013  International Business Machines Corporation and
+# Copyright (c) 2002-2015  International Business Machines Corporation and
  # others. All Rights Reserved.
  #
  #  file:  line.txt
@@ -11,6 +11,9 @@
  #         TODO:  Rule LB 8 remains as it was in Unicode 5.2
  #         This is only because of a limitation of ICU break engine implementation,
  #         not because the older behavior is desirable.
+#
+#         This corresponds to CSS line-break=strict (BCP47 -u-lb-strict).
+#         It sets characters of class CJ to behave like NS.
  
  #
  #  Character Classes defined by TR 14.
@@ -97,6 +100,15 @@ $WJ = [:LineBreak =  Word_Joiner:];
  $XX = [:LineBreak =  Unknown:];
  $ZW = [:LineBreak =  ZWSpace:];
  
+# Special character classes for people & body part emoji:
+# Subsets of $CM:
+$ZWJ = \u200D;
+$EmojiVar = \uFE0F;
+# The following are subsets of $ID
+$EmojiForSeqs = [\u2764 \U0001F466-\U0001F469 \U0001F48B];
+$EmojiForMods = [\u261D \u270A-\u270C \U0001F385 \U0001F3C3-\U0001F3C4 \U0001F3C7 \U0001F3CA \U0001F442-\U0001F443 \U0001F446-\U0001F450 \U0001F466-\U0001F469 \U0001F46E-\U0001F478 \U0001F47C \U0001F481-\U0001F483 \U0001F485-\U0001F487 \U0001F4AA \U0001F596 \U0001F645-\U0001F647 \U0001F64B-\U0001F64F \U0001F6A3 \U0001F6B4-\U0001F6B6 \U0001F6C0];
+$EmojiMods = [\U0001F3FB-\U0001F3FF];
+
  #   Dictionary character set, for triggering language-based break engines. Currently
  #   limited to LineBreak=Complex_Context. Note that this set only works in Unicode
  #   5.0 or later as the definition of Complex_Context was corrected to include all
@@ -231,6 +243,10 @@ $LB8Breaks    = [$LB4Breaks $ZW];
  $LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
  
  
+# Special forward rule for people & body part emoji:
+# don't break $ZWJ from subsequent $EmojiForSeqs
+$EmojiForSeqs $EmojiVar? $EmojiMods? $ZWJ $EmojiForSeqs;
+
  # LB 9     Combining marks.      X   $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL 
  #                                $CM not covered by the above needs to behave like $AL   
  #                                See definition of $CAN_CM.
@@ -402,6 +418,10 @@ $CPcm ($ALcm | $HLcm | $NUcm);
  # LB 30a  Do not break between regional indicators.
  $RIcm $RIcm;
  
+# Special forward rule for people & body part emoji:
+# don't break between relevant emoji and $EmojiMods
+$EmojiForMods $EmojiVar? $EmojiMods;
+
  #
  #  Reverse Rules.
  #
@@ -484,6 +504,10 @@ $LF $CR;
  #           Requires an engine enhancement.
  #   / $SP* $ZW
  
+# Special reverse rule for people & body part emoji:
+# don't break $ZWJ from subsequent $EmojiForSeqs
+$EmojiForSeqs $ZWJ $EmojiMods? $EmojiVar? $EmojiForSeqs;
+
  # LB 9,10  Combining marks.
  #    X   $CM needs to behave like X, where X is not $SP or controls.
  #    $CM not covered by the above needs to behave like $AL
@@ -626,6 +650,10 @@ $CM* ($ALPlus | $HL | $NU) $CM* $CP;
  # LB 30a
  $CM* $RI $CM* $RI;
  
+# Special reverse rule for people & body part emoji:
+# don't break between relevant emoji and $EmojiMods
+$EmojiMods $EmojiVar? $EmojiForMods;
+
  ## -------------------------------------------------
  
  !!safe_reverse;
diff --git a/icuSources/data/brkitr/line_fi.txt b/icuSources/data/brkitr/line_fi.txt

index adf78bd3887404c8c429c6f4fb16167f11501db9..8a6287bc7fb6706b81876daee29d407b282d0c2a 100644 (file)
--- a/icuSources/data/brkitr/line_fi.txt
+++ b/icuSources/data/brkitr/line_fi.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2002-2013  International Business Machines Corporation and
+# Copyright (c) 2002-2015  International Business Machines Corporation and
  # others. All Rights Reserved.
  #
  #  file:  line_fi.txt
@@ -7,10 +7,15 @@
  #         Implement default line breaking as defined by 
  #         Unicode Standard Annex #14 Revision 29 for Unicode 6.2
  #         http://www.unicode.org/reports/tr14/
+#         tailored as noted in 2nd paragraph below.
  #
  #         TODO:  Rule LB 8 remains as it was in Unicode 5.2
  #         This is only because of a limitation of ICU break engine implementation,
  #         not because the older behavior is desirable.
+#
+#         This tailors the line break behavior for Finnish, while otherwise behaving
+#         per UAX 14 which corresponds to CSS line-break=strict (BCP47 -u-lb-strict).
+#         It sets characters of class CJ to behave like NS.
  
  #
  #  Character Classes defined by TR 14.
@@ -98,6 +103,15 @@ $WJ = [:LineBreak =  Word_Joiner:];
  $XX = [:LineBreak =  Unknown:];
  $ZW = [:LineBreak =  ZWSpace:];
  
+# Special character classes for people & body part emoji:
+# Subsets of $CM:
+$ZWJ = \u200D;
+$EmojiVar = \uFE0F;
+# The following are subsets of $ID
+$EmojiForSeqs = [\u2764 \U0001F466-\U0001F469 \U0001F48B];
+$EmojiForMods = [\u261D \u270A-\u270C \U0001F385 \U0001F3C3-\U0001F3C4 \U0001F3C7 \U0001F3CA \U0001F442-\U0001F443 \U0001F446-\U0001F450 \U0001F466-\U0001F469 \U0001F46E-\U0001F478 \U0001F47C \U0001F481-\U0001F483 \U0001F485-\U0001F487 \U0001F4AA \U0001F596 \U0001F645-\U0001F647 \U0001F64B-\U0001F64F \U0001F6A3 \U0001F6B4-\U0001F6B6 \U0001F6C0];
+$EmojiMods = [\U0001F3FB-\U0001F3FF];
+
  #   Dictionary character set, for triggering language-based break engines. Currently
  #   limited to LineBreak=Complex_Context. Note that this set only works in Unicode
  #   5.0 or later as the definition of Complex_Context was corrected to include all
@@ -234,6 +248,10 @@ $LB8Breaks    = [$LB4Breaks $ZW];
  $LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
  
  
+# Special forward rule for people & body part emoji:
+# don't break $ZWJ from subsequent $EmojiForSeqs
+$EmojiForSeqs $EmojiVar? $EmojiMods? $ZWJ $EmojiForSeqs;
+
  # LB 9     Combining marks.      X   $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL 
  #                                $CM not covered by the above needs to behave like $AL   
  #                                See definition of $CAN_CM.
@@ -408,6 +426,10 @@ $CPcm ($ALcm | $HLcm | $NUcm);
  # LB 30a  Do not break between regional indicators.
  $RIcm $RIcm;
  
+# Special forward rule for people & body part emoji:
+# don't break between relevant emoji and $EmojiMods
+$EmojiForMods $EmojiVar? $EmojiMods;
+
  #
  #  Reverse Rules.
  #
@@ -491,6 +513,10 @@ $LF $CR;
  #           Requires an engine enhancement.
  #   / $SP* $ZW
  
+# Special reverse rule for people & body part emoji:
+# don't break $ZWJ from subsequent $EmojiForSeqs
+$EmojiForSeqs $ZWJ $EmojiMods? $EmojiVar? $EmojiForSeqs;
+
  # LB 9,10  Combining marks.
  #    X   $CM needs to behave like X, where X is not $SP or controls.
  #    $CM not covered by the above needs to behave like $AL
@@ -636,6 +662,10 @@ $CM* ($ALPlus | $HL | $NU) $CM* $CP;
  # LB 30a
  $CM* $RI $CM* $RI;
  
+# Special reverse rule for people & body part emoji:
+# don't break between relevant emoji and $EmojiMods
+$EmojiMods $EmojiVar? $EmojiForMods;
+
  ## -------------------------------------------------
  
  !!safe_reverse;
diff --git a/icuSources/data/brkitr/line_ja.txt b/icuSources/data/brkitr/line_ja.txt

index 70b203d1b0d26afae1aa80d16797c0c40be68a8b..ef364c14df312b8b38b0176641c60d9ab81e4f9c 100644 (file)
--- a/icuSources/data/brkitr/line_ja.txt
+++ b/icuSources/data/brkitr/line_ja.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2002-2013  International Business Machines Corporation and
+# Copyright (c) 2002-2015  International Business Machines Corporation and
  # others. All Rights Reserved.
  #
  #  file:  line_ja.txt
@@ -97,6 +97,15 @@ $WJ = [:LineBreak =  Word_Joiner:];
  $XX = [:LineBreak =  Unknown:];
  $ZW = [:LineBreak =  ZWSpace:];
  
+# Special character classes for people & body part emoji:
+# Subsets of $CM:
+$ZWJ = \u200D;
+$EmojiVar = \uFE0F;
+# The following are subsets of $ID
+$EmojiForSeqs = [\u2764 \U0001F466-\U0001F469 \U0001F48B];
+$EmojiForMods = [\u261D \u270A-\u270C \U0001F385 \U0001F3C3-\U0001F3C4 \U0001F3C7 \U0001F3CA \U0001F442-\U0001F443 \U0001F446-\U0001F450 \U0001F466-\U0001F469 \U0001F46E-\U0001F478 \U0001F47C \U0001F481-\U0001F483 \U0001F485-\U0001F487 \U0001F4AA \U0001F596 \U0001F645-\U0001F647 \U0001F64B-\U0001F64F \U0001F6A3 \U0001F6B4-\U0001F6B6 \U0001F6C0];
+$EmojiMods = [\U0001F3FB-\U0001F3FF];
+
  #   Dictionary character set, for triggering language-based break engines. Currently
  #   limited to LineBreak=Complex_Context. Note that this set only works in Unicode
  #   5.0 or later as the definition of Complex_Context was corrected to include all
@@ -231,6 +240,10 @@ $LB8Breaks    = [$LB4Breaks $ZW];
  $LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
  
  
+# Special forward rule for people & body part emoji:
+# don't break $ZWJ from subsequent $EmojiForSeqs
+$EmojiForSeqs $EmojiVar? $EmojiMods? $ZWJ $EmojiForSeqs;
+
  # LB 9     Combining marks.      X   $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL 
  #                                $CM not covered by the above needs to behave like $AL   
  #                                See definition of $CAN_CM.
@@ -402,6 +415,10 @@ $CPcm ($ALcm | $HLcm | $NUcm);
  # LB 30a  Do not break between regional indicators.
  $RIcm $RIcm;
  
+# Special forward rule for people & body part emoji:
+# don't break between relevant emoji and $EmojiMods
+$EmojiForMods $EmojiVar? $EmojiMods;
+
  #
  #  Reverse Rules.
  #
@@ -484,6 +501,10 @@ $LF $CR;
  #           Requires an engine enhancement.
  #   / $SP* $ZW
  
+# Special reverse rule for people & body part emoji:
+# don't break $ZWJ from subsequent $EmojiForSeqs
+$EmojiForSeqs $ZWJ $EmojiMods? $EmojiVar? $EmojiForSeqs;
+
  # LB 9,10  Combining marks.
  #    X   $CM needs to behave like X, where X is not $SP or controls.
  #    $CM not covered by the above needs to behave like $AL
@@ -626,6 +647,10 @@ $CM* ($ALPlus | $HL | $NU) $CM* $CP;
  # LB 30a
  $CM* $RI $CM* $RI;
  
+# Special reverse rule for people & body part emoji:
+# don't break between relevant emoji and $EmojiMods
+$EmojiMods $EmojiVar? $EmojiForMods;
+
  ## -------------------------------------------------
  
  !!safe_reverse;
diff --git a/icuSources/data/brkitr/line_loose.txt b/icuSources/data/brkitr/line_loose.txt

new file mode 100644 (file)

index 0000000..e2f0765
--- /dev/null
+++ b/icuSources/data/brkitr/line_loose.txt
@@ -0,0 +1,717 @@
+# Copyright (c) 2002-2015  International Business Machines Corporation and
+# others. All Rights Reserved.
+#
+#  file:  line_loose.txt
+#
+#         Line Breaking Rules
+#         Implement default line breaking as defined by 
+#         Unicode Standard Annex #14 Revision 29 for Unicode 6.2
+#         http://www.unicode.org/reports/tr14/
+#         tailored as noted in the 2nd paragraph below.
+#
+#         TODO:  Rule LB 8 remains as it was in Unicode 5.2
+#         This is only because of a limitation of ICU break engine implementation,
+#         not because the older behavior is desirable.
+#
+#         This tailors the line break behavior to correspond to CSS
+#         line-break=loose (BCP47 -u-lb-loose) as defined for languages other than 
+#         Chinese & Japanese.
+#         It sets characters of class CJ to behave like ID.
+#         In addition, it allows breaks:
+#         * before iteration marks 3005, 303B, 309D, 309E, 30FD, 30FE (all NS)
+#         * between characters of LineBreak class IN
+
+#
+#  Character Classes defined by TR 14.
+#
+
+!!chain;
+!!LBCMNoChain;
+
+
+!!lookAheadHardBreak;
+#
+#  !!lookAheadHardBreak    Described here because it is (as yet) undocumented elsewhere
+#                          and only used for the line break rules.
+#
+#           It is used in the implementation of rule LB 10
+#           which says to treat any combining mark that is not attached to a base
+#           character as if it were of class AL  (alphabetic).
+#
+#           The problem occurs in the reverse rules.
+#
+#           Consider a sequence like the following, with the correct breaks as shown:
+#               LF  ID  CM  AL  AL
+#                  ^       ^       ^
+#           Then consider the sequence without the initial ID (ideographic)
+#                 LF  CM  AL  AL
+#                    ^           ^
+#           Our CM, which in the first example was attached to the ideograph,
+#           is now unattached, becomes an alpha, and joins in with the other
+#           alphas.
+#
+#           When iterating forwards, these sequences do not present any problems.
+#           When iterating backwards, we need to look ahead when encountering
+#           a CM to see whether it attaches to something further on or not.
+#           (Look-ahead in a reverse rule is looking towards the start)
+#
+#           If the CM is unattached, we need to force a break.
+#
+#           !!lookAheadHardBreak forces the run time state machine to
+#           stop immediately when a look ahead rule ( '/' operator) matches,
+#           and set the match position to that of the look-ahead operator,
+#           no matter what other rules may be in play at the time.
+#
+#           See rule LB 19 for an example.
+#
+
+$AI = [:LineBreak =  Ambiguous:];
+$AL = [:LineBreak =  Alphabetic:];
+$BA = [:LineBreak =  Break_After:];
+$BB = [:LineBreak =  Break_Before:];
+$BK = [:LineBreak =  Mandatory_Break:];
+$B2 = [:LineBreak =  Break_Both:];
+$CB = [:LineBreak =  Contingent_Break:];
+$CJ = [:LineBreak =  Conditional_Japanese_Starter:];
+$CL = [:LineBreak =  Close_Punctuation:];
+$CM = [:LineBreak =  Combining_Mark:];
+$CP = [:LineBreak =  Close_Parenthesis:];
+$CR = [:LineBreak =  Carriage_Return:];
+$EX = [:LineBreak =  Exclamation:];
+$GL = [:LineBreak =  Glue:];
+$HL = [:LineBreak =  Hebrew_Letter:];
+$HY = [:LineBreak =  Hyphen:];
+$H2 = [:LineBreak =  H2:];
+$H3 = [:LineBreak =  H3:];
+$ID = [[:LineBreak =  Ideographic:] $CJ];
+$IN = [:LineBreak =  Inseperable:];
+$IS = [:LineBreak =  Infix_Numeric:];
+$JL = [:LineBreak =  JL:];
+$JV = [:LineBreak =  JV:];
+$JT = [:LineBreak =  JT:];
+$LF = [:LineBreak =  Line_Feed:];
+$NL = [:LineBreak =  Next_Line:];
+$NSX = [\u3005 \u303B \u309D \u309E \u30FD \u30FE];
+$NS = [[:LineBreak =  Nonstarter:] - $NSX];
+$NU = [:LineBreak =  Numeric:];
+$OP = [:LineBreak =  Open_Punctuation:];
+$PO = [:LineBreak =  Postfix_Numeric:];
+$PR = [:LineBreak =  Prefix_Numeric:];
+$QU = [:LineBreak =  Quotation:];
+$RI = [:LineBreak =  Regional_Indicator:];
+$SA = [:LineBreak =  Complex_Context:];
+$SG = [:LineBreak =  Surrogate:];
+$SP = [:LineBreak =  Space:];
+$SY = [:LineBreak =  Break_Symbols:];
+$WJ = [:LineBreak =  Word_Joiner:];
+$XX = [:LineBreak =  Unknown:];
+$ZW = [:LineBreak =  ZWSpace:];
+
+# Special character classes for people & body part emoji:
+# Subsets of $CM:
+$ZWJ = \u200D;
+$EmojiVar = \uFE0F;
+# The following are subsets of $ID
+$EmojiForSeqs = [\u2764 \U0001F466-\U0001F469 \U0001F48B];
+$EmojiForMods = [\u261D \u270A-\u270C \U0001F385 \U0001F3C3-\U0001F3C4 \U0001F3C7 \U0001F3CA \U0001F442-\U0001F443 \U0001F446-\U0001F450 \U0001F466-\U0001F469 \U0001F46E-\U0001F478 \U0001F47C \U0001F481-\U0001F483 \U0001F485-\U0001F487 \U0001F4AA \U0001F596 \U0001F645-\U0001F647 \U0001F64B-\U0001F64F \U0001F6A3 \U0001F6B4-\U0001F6B6 \U0001F6C0];
+$EmojiMods = [\U0001F3FB-\U0001F3FF];
+
+#   Dictionary character set, for triggering language-based break engines. Currently
+#   limited to LineBreak=Complex_Context. Note that this set only works in Unicode
+#   5.0 or later as the definition of Complex_Context was corrected to include all
+#   characters requiring dictionary break.
+
+$dictionary = [:LineBreak = Complex_Context:];
+
+#
+#  Rule LB1.  By default, treat AI  (characters with ambiguous east Asian width),
+#                               SA  (South East Asian: Thai, Lao, Khmer)
+#                               SG  (Unpaired Surrogates)
+#                               XX  (Unknown, unassigned)
+#                         as $AL  (Alphabetic)
+#
+$ALPlus = [$AL $AI $SA $SG $XX];
+
+#
+#  Combining Marks.   X $CM*  behaves as if it were X.  Rule LB6.
+#
+$ALcm = $ALPlus $CM*;
+$BAcm = $BA $CM*;
+$BBcm = $BB $CM*;
+$B2cm = $B2 $CM*;
+$CLcm = $CL $CM*;
+$CPcm = $CP $CM*;
+$EXcm = $EX $CM*;
+$GLcm = $GL $CM*;
+$HLcm = $HL $CM*;
+$HYcm = $HY $CM*;
+$H2cm = $H2 $CM*;
+$H3cm = $H3 $CM*;
+$IDcm = $ID $CM*;
+$INcm = $IN $CM*;
+$IScm = $IS $CM*;
+$JLcm = $JL $CM*;
+$JVcm = $JV $CM*;
+$JTcm = $JT $CM*;
+$NScm = $NS $CM*;
+$NSXcm = $NSX $CM*;
+$NUcm = $NU $CM*;
+$OPcm = $OP $CM*;
+$POcm = $PO $CM*;
+$PRcm = $PR $CM*;
+$QUcm = $QU $CM*;
+$RIcm = $RI $CM*;
+$SYcm = $SY $CM*;
+$WJcm = $WJ $CM*;
+
+## -------------------------------------------------
+
+!!forward;
+
+#
+#  Each class of character can stand by itself as an unbroken token, with trailing combining stuff
+#
+$ALPlus $CM+;
+$BA $CM+;
+$BB $CM+;
+$B2 $CM+;
+$CL $CM+;
+$CP $CM+;
+$EX $CM+;
+$GL $CM+;
+$HL $CM+;
+$HY $CM+;
+$H2 $CM+;
+$H3 $CM+;
+$ID $CM+;
+$IN $CM+;
+$IS $CM+;
+$JL $CM+;
+$JV $CM+;
+$JT $CM+;
+$NS $CM+;
+$NSX $CM+;
+$NU $CM+;
+$OP $CM+;
+$PO $CM+;
+$PR $CM+;
+$QU $CM+;
+$RI $CM+;
+$SY $CM+;
+$WJ $CM+;
+
+#
+# CAN_CM  is the set of characters that may combine with CM combining chars.
+#         Note that Linebreak UAX 14's concept of a combining char and the rules
+#         for what they can combine with are _very_ different from the rest of Unicode.
+#
+#         Note that $CM itself is left out of this set.  If CM is needed as a base
+#         it must be listed separately in the rule.
+#
+$CAN_CM  = [^$SP $BK $CR $LF $NL $ZW $CM];       # Bases that can   take CMs
+$CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM];       # Bases that can't take CMs
+
+#
+# AL_FOLLOW  set of chars that can unconditionally follow an AL
+#            Needed in rules where stand-alone $CM s are treated as AL.
+#            Chaining is disabled with CM because it causes other failures,
+#            so for this one case we need to manually list out longer sequences.
+#
+$AL_FOLLOW_NOCM = [$BK $CR $LF $NL $ZW $SP];
+$AL_FOLLOW_CM   = [$CL $CP $EX $HL $IS $SY $WJ $GL $OP $QU $BA $HY $NS $NSX $IN $NU $ALPlus];
+$AL_FOLLOW      = [$AL_FOLLOW_NOCM $AL_FOLLOW_CM];
+
+
+#
+#  Rule LB 4, 5    Mandatory (Hard) breaks.
+#
+$LB4Breaks    = [$BK $CR $LF $NL];
+$LB4NonBreaks = [^$BK $CR $LF $NL];
+$CR $LF {100};
+
+#
+#  LB 6    Do not break before hard line breaks.
+#
+$LB4NonBreaks?  $LB4Breaks {100};    # LB 6  do not break before hard breaks.
+$CAN_CM $CM*    $LB4Breaks {100};
+$CM+            $LB4Breaks {100};
+
+# LB 7         x SP
+#              x ZW
+$LB4NonBreaks [$SP $ZW];
+$CAN_CM $CM*  [$SP $ZW];
+$CM+          [$SP $ZW];
+
+#
+# LB 8         Break after zero width space
+#              TODO:  ZW SP* <break>
+#              An engine change is required to write the reverse rule for this.
+#              For now, leave the Unicode 5.2 rule, ZW <break>
+#
+$LB8Breaks    = [$LB4Breaks $ZW];
+$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
+
+
+# Special forward rule for people & body part emoji:
+# don't break $ZWJ from subsequent $EmojiForSeqs
+$EmojiForSeqs $EmojiVar? $EmojiMods? $ZWJ $EmojiForSeqs;
+
+# LB 9     Combining marks.      X   $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL 
+#                                $CM not covered by the above needs to behave like $AL   
+#                                See definition of $CAN_CM.
+
+$CAN_CM $CM+;                   #  Stick together any combining sequences that don't match other rules.
+$CM+;
+
+#
+# LB 11  Do not break before or after WORD JOINER & related characters.
+#
+$CAN_CM $CM*  $WJcm;
+$LB8NonBreaks $WJcm;
+$CM+          $WJcm;
+
+$WJcm $CANT_CM;
+$WJcm $CAN_CM $CM*;
+
+#
+# LB 12  Do not break after NBSP and related characters.
+#         GL  x
+#
+$GLcm $CAN_CM $CM*;
+$GLcm $CANT_CM;
+ 
+#
+# LB 12a  Do not break before NBSP and related characters ...
+#            [^SP BA HY] x GL
+#
+[[$LB8NonBreaks] - [$SP $BA $HY]] $CM* $GLcm;
+$CM+ $GLcm;                     # by rule 10, stand-alone CM behaves as AL
+
+
+
+#
+# LB 13   Don't break before ']' or '!' or ';' or '/', even after spaces.
+#
+$LB8NonBreaks $CL;
+$CAN_CM $CM*  $CL;
+$CM+          $CL;              # by rule 10, stand-alone CM behaves as AL
+
+$LB8NonBreaks $CP;
+$CAN_CM $CM*  $CP;
+$CM+          $CP;              # by rule 10, stand-alone CM behaves as AL
+
+$LB8NonBreaks $EX;
+$CAN_CM $CM*  $EX;
+$CM+          $EX;              # by rule 10, stand-alone CM behaves as AL
+
+$LB8NonBreaks $IS;
+$CAN_CM $CM*  $IS;
+$CM+          $IS;              # by rule 10, stand-alone CM behaves as AL
+
+$LB8NonBreaks $SY;
+$CAN_CM $CM*  $SY;
+$CM+          $SY;              # by rule 10, stand-alone CM behaves as AL
+
+
+#
+# LB 14  Do not break after OP, even after spaces
+#
+$OPcm $SP* $CAN_CM $CM*;
+$OPcm $SP* $CANT_CM;
+
+$OPcm $SP+ $CM+ $AL_FOLLOW?;    # by rule 10, stand-alone CM behaves as AL
+
+# LB 15
+$QUcm $SP* $OPcm;
+
+# LB 16
+# Do not break between closing punctuation and $NS, even with intervening spaces
+# But DO allow a break between closing punctuation and $NSX, don't include it here
+($CLcm | $CPcm) $SP* $NScm;
+
+# LB 17
+$B2cm $SP* $B2cm;
+
+#
+# LB 18  Break after spaces.
+#
+$LB18NonBreaks = [$LB8NonBreaks - [$SP]];
+$LB18Breaks    = [$LB8Breaks $SP];
+
+
+# LB 19
+#         x QU
+$LB18NonBreaks $CM* $QUcm;
+$CM+                $QUcm;
+
+#         QU  x
+$QUcm .?;
+$QUcm $LB18NonBreaks $CM*;    # Don't let a combining mark go onto $CR, $BK, etc.
+                              #  TODO:  I don't think this rule is needed.
+
+
+# LB 20
+#        <break>  $CB
+#        $CB   <break>
+
+$LB20NonBreaks = [$LB18NonBreaks - $CB];
+
+# LB 21        x   (BA | HY | NS)
+#           BB x
+#
+# DO allow breaks here before NSXcm, so don't include it
+$LB20NonBreaks $CM* ($BAcm | $HYcm | $NScm); 
+
+$BBcm [^$CB];                                  #  $BB  x
+$BBcm $LB20NonBreaks $CM*;
+
+# LB 21a Don't break after Hebrew + Hyphen
+#   HL (HY | BA) x
+#  
+$HLcm ($HYcm | $BAcm) [^$CB]?;
+
+# LB 21b (forward) Don't break between SY and HL
+# (break between HL and SY already disallowed by LB 13 above)
+$SYcm $HLcm;
+
+# LB 22
+($ALcm | $HLcm) $INcm;
+$CM+     $INcm;     #  by rule 10, any otherwise unattached CM behaves as AL
+$IDcm    $INcm;
+# $INcm  $INcm; # delete this rule for CSS loose
+$NUcm    $INcm;
+
+
+# LB 23
+$IDcm  $POcm;
+$ALcm  $NUcm;       # includes $LB19
+$HLcm  $NUcm;
+$CM+   $NUcm;       # Rule 10, any otherwise unattached CM behaves as AL
+$NUcm  $ALcm;
+$NUcm  $HLcm;
+
+#
+# LB 24
+#
+$PRcm $IDcm;
+$PRcm ($ALcm | $HLcm);
+$POcm ($ALcm | $HLcm);
+
+#
+# LB 25   Numbers.
+#
+($PRcm | $POcm)? ($OPcm | $HYcm)? $NUcm ($NUcm | $SYcm | $IScm)* ($CLcm | $CPcm)? ($PRcm | $POcm)?;
+
+# LB 26  Do not break a Korean syllable
+#
+$JLcm ($JLcm | $JVcm | $H2cm | $H3cm);
+($JVcm | $H2cm) ($JVcm | $JTcm);
+($JTcm | $H3cm) $JTcm;
+
+# LB 27  Treat Korean Syllable Block the same as ID  (don't break it)
+($JLcm | $JVcm | $JTcm | $H2cm | $H3cm) $INcm;
+($JLcm | $JVcm | $JTcm | $H2cm | $H3cm) $POcm;
+$PRcm ($JLcm | $JVcm | $JTcm | $H2cm | $H3cm);
+
+
+# LB 28   Do not break between alphabetics
+#
+($ALcm | $HLcm) ($ALcm | $HLcm);
+$CM+ ($ALcm | $HLcm);      # The $CM+ is from rule 10, an unattached CM is treated as AL
+
+# LB 29
+$IScm ($ALcm | $HLcm);
+
+# LB 30
+($ALcm | $HLcm | $NUcm) $OPcm;
+$CM+ $OPcm;         # The $CM+ is from rule 10, an unattached CM is treated as AL.          
+$CPcm ($ALcm | $HLcm | $NUcm);
+
+# LB 30a  Do not break between regional indicators.
+$RIcm $RIcm;
+
+# Special forward rule for people & body part emoji:
+# don't break between relevant emoji and $EmojiMods
+$EmojiForMods $EmojiVar? $EmojiMods;
+
+#
+#  Reverse Rules.
+#
+## -------------------------------------------------
+
+!!reverse;
+
+$CM+ $ALPlus;
+$CM+ $BA;
+$CM+ $BB;
+$CM+ $B2;
+$CM+ $CL;
+$CM+ $CP;
+$CM+ $EX;
+$CM+ $GL;
+$CM+ $HL;
+$CM+ $HY;
+$CM+ $H2;
+$CM+ $H3;
+$CM+ $ID;
+$CM+ $IN;
+$CM+ $IS;
+$CM+ $JL;
+$CM+ $JV;
+$CM+ $JT;
+$CM+ $NS;
+$CM+ $NSX;
+$CM+ $NU;
+$CM+ $OP;
+$CM+ $PO;
+$CM+ $PR;
+$CM+ $QU;
+$CM+ $RI;
+$CM+ $SY;
+$CM+ $WJ;
+$CM+;
+
+
+#
+#  Sequences of the form  (shown forwards)
+#      [CANT_CM]  <break>  [CM]  [whatever]
+#  The CM needs to behave as an AL
+#
+$AL_FOLLOW $CM+ / (
+          [$BK $CR $LF $NL $ZW {eof}] |
+          $SP+ $CM+ $SP |
+          $SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}]));   # if LB 14 will match, need to suppress this break.
+                                               #  LB14 says    OP SP* x .        
+                                               #    becomes    OP SP* x AL
+                                               #    becomes    OP SP* x CM+ AL_FOLLOW
+                                               #
+                                               # Further note:  the $AL in [$AL {eof}] is only to work around
+                                               #                a rule compiler bug which complains about
+                                               #                empty sets otherwise.
+          
+#
+#  Sequences of the form  (shown forwards)
+#      [CANT_CM]  <break> [CM]  <break>  [PR]
+#  The CM needs to behave as an AL
+#  This rule is concerned about getting the second of the two <breaks> in place.
+#
+
+[$PR   ] / $CM+ [$BK $CR $LF $NL $ZW $SP {eof}];
+
+
+
+# LB 4, 5, 6
+
+$LB4Breaks [$LB4NonBreaks-$CM];
+$LB4Breaks $CM+ $CAN_CM;
+$LF $CR;
+
+
+# LB 7         x SP
+#              x ZW
+[$SP $ZW] [$LB4NonBreaks-$CM];
+[$SP $ZW] $CM+ $CAN_CM;
+
+# LB 8 ZW SP* <break>
+#     TODO: to implement this, we need more than one look-ahead hard break in play at a time.
+#           Requires an engine enhancement.
+#   / $SP* $ZW
+
+# Special reverse rule for people & body part emoji:
+# don't break $ZWJ from subsequent $EmojiForSeqs
+$EmojiForSeqs $ZWJ $EmojiMods? $EmojiVar? $EmojiForSeqs;
+
+# LB 9,10  Combining marks.
+#    X   $CM needs to behave like X, where X is not $SP or controls.
+#    $CM not covered by the above needs to behave like $AL
+# Stick together any combining sequences that don't match other rules.
+$CM+ $CAN_CM;
+
+
+# LB 11
+$CM* $WJ $CM* $CAN_CM;
+$CM* $WJ      [$LB8NonBreaks-$CM];
+
+     $CANT_CM $CM* $WJ;
+$CM* $CAN_CM  $CM* $WJ;
+
+# LB 12a
+#      [^SP BA HY] x GL
+#
+$CM* $GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HY]];
+
+# LB 12
+#     GL  x
+#
+$CANT_CM $CM* $GL;
+$CM* $CAN_CM $CM* $GL;
+
+
+# LB 13
+$CL $CM+ $CAN_CM;
+$CP $CM+ $CAN_CM;
+$EX $CM+ $CAN_CM;
+$IS $CM+ $CAN_CM;
+$SY $CM+ $CAN_CM;
+
+$CL [$LB8NonBreaks-$CM];
+$CP [$LB8NonBreaks-$CM];
+$EX [$LB8NonBreaks-$CM];
+$IS [$LB8NonBreaks-$CM];
+$SY [$LB8NonBreaks-$CM];
+
+# Rule 13 & 14 taken together for an edge case.
+#   Match this, shown forward
+#     OP SP+  ($CM+ behaving as $AL) (CL | CP | EX | IS | SY)
+#   This really wants to chain at the $CM+ (which is acting as an $AL)
+#   except for $CM chaining being disabled.
+[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP;  
+
+# LB 14    OP SP* x
+#
+$CM* $CAN_CM    $SP* $CM* $OP;
+     $CANT_CM   $SP* $CM* $OP;
+$AL_FOLLOW? $CM+  $SP $SP* $CM* $OP;     #  by LB 10, behaves like $AL_FOLLOW? $AL $SP* $CM* $OP
+     
+     $AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;
+$CM* $AL_FOLLOW_CM   $CM+ $SP+ $CM* $OP;
+$SY $CM $SP+ $OP;   # TODO:  Experiment.  Remove.
+
+
+
+# LB 15
+$CM* $OP $SP* $CM* $QU;
+
+# LB 16
+# Don't include $NSX here
+$CM* $NS $SP* $CM* ($CL | $CP);
+
+# LB 17
+$CM* $B2 $SP* $CM* $B2;
+
+# LB 18  break after spaces
+#        Nothing explicit needed here.
+
+
+#
+# LB 19
+#
+$CM* $QU $CM* $CAN_CM;                                #   . x QU
+$CM* $QU      $LB18NonBreaks;
+
+
+$CM* $CAN_CM  $CM* $QU;                               #   QU x .
+     $CANT_CM $CM* $QU;
+     
+#
+#  LB 20  Break before and after CB.
+#         nothing needed here.
+#
+
+# LB 21
+# Don't include $NSX here
+$CM* ($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM];     #  . x (BA | HY | NS)
+
+$CM* [$LB20NonBreaks-$CM] $CM* $BB;                   #  BB x .
+[^$CB] $CM* $BB;                                      # 
+
+# LB21a
+[^$CB] $CM* ($HY | $BA) $CM* $HL;
+
+# LB21b (reverse)
+$CM* $HL $CM* $SY;
+
+# LB 22
+$CM* $IN $CM* ($ALPlus | $HL);
+$CM* $IN $CM* $ID;
+# $CM* $IN $CM* $IN; # delete this rule for CSS loose
+$CM* $IN $CM* $NU;
+
+# LB 23
+$CM* $PO $CM* $ID;
+$CM* $NU $CM* ($ALPlus | $HL);
+$CM* ($ALPlus | $HL) $CM* $NU;
+
+# LB 24
+$CM* $ID $CM* $PR;
+$CM* ($ALPlus | $HL) $CM* $PR;
+$CM* ($ALPlus | $HL) $CM* $PO;
+
+
+# LB 25
+($CM* ($PR | $PO))? ($CM* ($CL | $CP))? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP | $HY))? ($CM* ($PR | $PO))?;
+
+# LB 26
+$CM* ($H3 | $H2 | $JV | $JL) $CM* $JL;
+$CM* ($JT | $JV) $CM* ($H2 | $JV);
+$CM* $JT $CM* ($H3 | $JT);
+
+# LB 27
+$CM* $IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
+$CM* $PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
+$CM* ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
+
+# LB 28
+$CM* ($ALPlus | $HL) $CM* ($ALPlus | $HL);
+
+
+# LB 29
+$CM* ($ALPlus | $HL) $CM* $IS;
+
+# LB 30
+$CM* $OP $CM* ($ALPlus | $HL | $NU);
+$CM* ($ALPlus | $HL | $NU) $CM* $CP;
+
+# LB 30a
+$CM* $RI $CM* $RI;
+
+# Special reverse rule for people & body part emoji:
+# don't break between relevant emoji and $EmojiMods
+$EmojiMods $EmojiVar? $EmojiForMods;
+
+## -------------------------------------------------
+
+!!safe_reverse;
+
+# LB 9
+$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
+$CM+ $SP / .;
+
+# LB 14
+$SP+ $CM* $OP;
+
+# LB 15
+$SP+ $CM* $QU;
+
+# LB 16
+$SP+ $CM* ($CL | $CP);
+
+# LB 17
+$SP+ $CM* $B2;
+
+# LB 21
+$CM* ($HY | $BA) $CM* $HL;
+
+# LB 25
+($CM* ($IS | $SY))+ $CM* $NU;
+($CL | $CP) $CM* ($NU | $IS | $SY);
+
+# For dictionary-based break
+$dictionary $dictionary;
+
+## -------------------------------------------------
+
+!!safe_forward;
+
+# Skip forward over all character classes that are involved in
+#   rules containing patterns with possibly more than one char
+#   of context.
+#
+#  It might be slightly more efficient to have specific rules
+#  instead of one generic one, but only if we could
+#  turn off rule chaining.  We don't want to move more
+#  than necessary.
+#
+[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $SP $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $dictionary];
+$dictionary $dictionary;
+
diff --git a/icuSources/data/brkitr/line_loose_cj.txt b/icuSources/data/brkitr/line_loose_cj.txt

new file mode 100644 (file)

index 0000000..3943d36
--- /dev/null
+++ b/icuSources/data/brkitr/line_loose_cj.txt
@@ -0,0 +1,750 @@
+# Copyright (c) 2002-2015  International Business Machines Corporation and
+# others. All Rights Reserved.
+#
+#  file:  line_loose_cj.txt
+#
+#         Line Breaking Rules
+#         Implement default line breaking as defined by 
+#         Unicode Standard Annex #14 Revision 29 for Unicode 6.2
+#         http://www.unicode.org/reports/tr14/
+#         tailored as noted in the 2nd paragraph below.
+#
+#         TODO:  Rule LB 8 remains as it was in Unicode 5.2
+#         This is only because of a limitation of ICU break engine implementation,
+#         not because the older behavior is desirable.
+#
+#         This tailors the line break behavior to correspond to CSS
+#         line-break=loose (BCP47 -u-lb-loose) as defined for Chinese & Japanese.
+#         It sets characters of class CJ to behave like ID.
+#         In addition, it allows breaks:
+#         * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS)
+#         * before iteration marks 3005, 303B, 309D, 309E, 30FD, 30FE (all NS)
+#         * between characters of LineBreak class IN such as 2026
+#         * before some centered punct 203C, 2047, 2048, 2049, 30FB, FF1A, FF1B,
+#           FF65 (all NS) and FF01, FF1F (both EX).
+#         * before suffix characters with LineBreak class PO and EastAsianWidth A,F,W;
+#           this includes: 00B0 2030 2032 2033 2035 2103 2109 FE6A FF05 FFE0
+#         * after prefix characters with LineBreak class PR and EastAsianWidth A,F,W;
+#           this includes: 00A4 00B1 20AC 2116 FE69 FF04 FFE1 FFE5 FFE6
+
+
+#
+#  Character Classes defined by TR 14.
+#
+
+!!chain;
+!!LBCMNoChain;
+
+
+!!lookAheadHardBreak;
+#
+#  !!lookAheadHardBreak    Described here because it is (as yet) undocumented elsewhere
+#                          and only used for the line break rules.
+#
+#           It is used in the implementation of rule LB 10
+#           which says to treat any combining mark that is not attached to a base
+#           character as if it were of class AL  (alphabetic).
+#
+#           The problem occurs in the reverse rules.
+#
+#           Consider a sequence like the following, with the correct breaks as shown:
+#               LF  ID  CM  AL  AL
+#                  ^       ^       ^
+#           Then consider the sequence without the initial ID (ideographic)
+#                 LF  CM  AL  AL
+#                    ^           ^
+#           Our CM, which in the first example was attached to the ideograph,
+#           is now unattached, becomes an alpha, and joins in with the other
+#           alphas.
+#
+#           When iterating forwards, these sequences do not present any problems.
+#           When iterating backwards, we need to look ahead when encountering
+#           a CM to see whether it attaches to something further on or not.
+#           (Look-ahead in a reverse rule is looking towards the start)
+#
+#           If the CM is unattached, we need to force a break.
+#
+#           !!lookAheadHardBreak forces the run time state machine to
+#           stop immediately when a look ahead rule ( '/' operator) matches,
+#           and set the match position to that of the look-ahead operator,
+#           no matter what other rules may be in play at the time.
+#
+#           See rule LB 19 for an example.
+#
+
+$AI = [:LineBreak =  Ambiguous:];
+$AL = [:LineBreak =  Alphabetic:];
+$BAX = [\u2010 \u2013];
+$BA = [[:LineBreak =  Break_After:] - $BAX];
+$BB = [:LineBreak =  Break_Before:];
+$BK = [:LineBreak =  Mandatory_Break:];
+$B2 = [:LineBreak =  Break_Both:];
+$CB = [:LineBreak =  Contingent_Break:];
+$CJ = [:LineBreak =  Conditional_Japanese_Starter:];
+$CL = [:LineBreak =  Close_Punctuation:];
+$CM = [:LineBreak =  Combining_Mark:];
+$CP = [:LineBreak =  Close_Parenthesis:];
+$CR = [:LineBreak =  Carriage_Return:];
+$EXX = [\uFF01 \uFF1F];
+$EX = [[:LineBreak =  Exclamation:] - $EXX];
+$GL = [:LineBreak =  Glue:];
+$HL = [:LineBreak =  Hebrew_Letter:];
+$HY = [:LineBreak =  Hyphen:];
+$H2 = [:LineBreak =  H2:];
+$H3 = [:LineBreak =  H3:];
+$ID = [[:LineBreak =  Ideographic:] $CJ];
+$IN = [:LineBreak =  Inseperable:];
+$IS = [:LineBreak =  Infix_Numeric:];
+$JL = [:LineBreak =  JL:];
+$JV = [:LineBreak =  JV:];
+$JT = [:LineBreak =  JT:];
+$LF = [:LineBreak =  Line_Feed:];
+$NL = [:LineBreak =  Next_Line:];
+$NSX = [\u301C \u30A0 \u3005 \u303B \u309D \u309E \u30FD \u30FE \u203C \u2047 \u2048 \u2049 \u30FB \uFF1A \uFF1B \uFF65];
+$NS = [[:LineBreak =  Nonstarter:] - $NSX];
+$NU = [:LineBreak =  Numeric:];
+$OP = [:LineBreak =  Open_Punctuation:];
+$POX = [\u00B0 \u2030 \u2032 \u2033 \u2035 \u2103 \u2109 \uFE6A \uFF05 \uFFE0];
+$PO = [[:LineBreak =  Postfix_Numeric:] - $POX];
+$PRX = [\u00A4 \u00B1 \u20AC \u2116 \uFE69 \uFF04 \uFFE1 \uFFE5 \uFFE6];
+$PR = [[:LineBreak =  Prefix_Numeric:] - $PRX];
+$QU = [:LineBreak =  Quotation:];
+$RI = [:LineBreak =  Regional_Indicator:];
+$SA = [:LineBreak =  Complex_Context:];
+$SG = [:LineBreak =  Surrogate:];
+$SP = [:LineBreak =  Space:];
+$SY = [:LineBreak =  Break_Symbols:];
+$WJ = [:LineBreak =  Word_Joiner:];
+$XX = [:LineBreak =  Unknown:];
+$ZW = [:LineBreak =  ZWSpace:];
+
+# Special character classes for people & body part emoji:
+# Subsets of $CM:
+$ZWJ = \u200D;
+$EmojiVar = \uFE0F;
+# The following are subsets of $ID
+$EmojiForSeqs = [\u2764 \U0001F466-\U0001F469 \U0001F48B];
+$EmojiForMods = [\u261D \u270A-\u270C \U0001F385 \U0001F3C3-\U0001F3C4 \U0001F3C7 \U0001F3CA \U0001F442-\U0001F443 \U0001F446-\U0001F450 \U0001F466-\U0001F469 \U0001F46E-\U0001F478 \U0001F47C \U0001F481-\U0001F483 \U0001F485-\U0001F487 \U0001F4AA \U0001F596 \U0001F645-\U0001F647 \U0001F64B-\U0001F64F \U0001F6A3 \U0001F6B4-\U0001F6B6 \U0001F6C0];
+$EmojiMods = [\U0001F3FB-\U0001F3FF];
+
+#   Dictionary character set, for triggering language-based break engines. Currently
+#   limited to LineBreak=Complex_Context. Note that this set only works in Unicode
+#   5.0 or later as the definition of Complex_Context was corrected to include all
+#   characters requiring dictionary break.
+
+$dictionary = [:LineBreak = Complex_Context:];
+
+#
+#  Rule LB1.  By default, treat AI  (characters with ambiguous east Asian width),
+#                               SA  (South East Asian: Thai, Lao, Khmer)
+#                               SG  (Unpaired Surrogates)
+#                               XX  (Unknown, unassigned)
+#                         as $AL  (Alphabetic)
+#
+$ALPlus = [$AL $AI $SA $SG $XX];
+
+#
+#  Combining Marks.   X $CM*  behaves as if it were X.  Rule LB6.
+#
+$ALcm = $ALPlus $CM*;
+$BAcm = $BA $CM*;
+$BAXcm = $BAX $CM*;
+$BBcm = $BB $CM*;
+$B2cm = $B2 $CM*;
+$CLcm = $CL $CM*;
+$CPcm = $CP $CM*;
+$EXcm = $EX $CM*;
+$EXXcm = $EXX $CM*;
+$GLcm = $GL $CM*;
+$HLcm = $HL $CM*;
+$HYcm = $HY $CM*;
+$H2cm = $H2 $CM*;
+$H3cm = $H3 $CM*;
+$IDcm = $ID $CM*;
+$INcm = $IN $CM*;
+$IScm = $IS $CM*;
+$JLcm = $JL $CM*;
+$JVcm = $JV $CM*;
+$JTcm = $JT $CM*;
+$NScm = $NS $CM*;
+$NSXcm = $NSX $CM*;
+$NUcm = $NU $CM*;
+$OPcm = $OP $CM*;
+$POcm = $PO $CM*;
+$POXcm = $POX $CM*;
+$PRcm = $PR $CM*;
+$PRXcm = $PRX $CM*;
+$QUcm = $QU $CM*;
+$RIcm = $RI $CM*;
+$SYcm = $SY $CM*;
+$WJcm = $WJ $CM*;
+
+## -------------------------------------------------
+
+!!forward;
+
+#
+#  Each class of character can stand by itself as an unbroken token, with trailing combining stuff
+#
+$ALPlus $CM+;
+$BA $CM+;
+$BAX $CM+;
+$BB $CM+;
+$B2 $CM+;
+$CL $CM+;
+$CP $CM+;
+$EX $CM+;
+$EXX $CM+;
+$GL $CM+;
+$HL $CM+;
+$HY $CM+;
+$H2 $CM+;
+$H3 $CM+;
+$ID $CM+;
+$IN $CM+;
+$IS $CM+;
+$JL $CM+;
+$JV $CM+;
+$JT $CM+;
+$NS $CM+;
+$NSX $CM+;
+$NU $CM+;
+$OP $CM+;
+$PO $CM+;
+$POX $CM+;
+$PR $CM+;
+$PRX $CM+;
+$QU $CM+;
+$RI $CM+;
+$SY $CM+;
+$WJ $CM+;
+
+#
+# CAN_CM  is the set of characters that may combine with CM combining chars.
+#         Note that Linebreak UAX 14's concept of a combining char and the rules
+#         for what they can combine with are _very_ different from the rest of Unicode.
+#
+#         Note that $CM itself is left out of this set.  If CM is needed as a base
+#         it must be listed separately in the rule.
+#
+$CAN_CM  = [^$SP $BK $CR $LF $NL $ZW $CM];       # Bases that can   take CMs
+$CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM];       # Bases that can't take CMs
+
+#
+# AL_FOLLOW  set of chars that can unconditionally follow an AL
+#            Needed in rules where stand-alone $CM s are treated as AL.
+#            Chaining is disabled with CM because it causes other failures,
+#            so for this one case we need to manually list out longer sequences.
+#
+$AL_FOLLOW_NOCM = [$BK $CR $LF $NL $ZW $SP];
+$AL_FOLLOW_CM   = [$CL $CP $EX $EXX $HL $IS $SY $WJ $GL $OP $QU $BA $BAX $HY $NS $NSX $IN $NU $ALPlus];
+$AL_FOLLOW      = [$AL_FOLLOW_NOCM $AL_FOLLOW_CM];
+
+
+#
+#  Rule LB 4, 5    Mandatory (Hard) breaks.
+#
+$LB4Breaks    = [$BK $CR $LF $NL];
+$LB4NonBreaks = [^$BK $CR $LF $NL];
+$CR $LF {100};
+
+#
+#  LB 6    Do not break before hard line breaks.
+#
+$LB4NonBreaks?  $LB4Breaks {100};    # LB 6  do not break before hard breaks.
+$CAN_CM $CM*    $LB4Breaks {100};
+$CM+            $LB4Breaks {100};
+
+# LB 7         x SP
+#              x ZW
+$LB4NonBreaks [$SP $ZW];
+$CAN_CM $CM*  [$SP $ZW];
+$CM+          [$SP $ZW];
+
+#
+# LB 8         Break after zero width space
+#              TODO:  ZW SP* <break>
+#              An engine change is required to write the reverse rule for this.
+#              For now, leave the Unicode 5.2 rule, ZW <break>
+#
+$LB8Breaks    = [$LB4Breaks $ZW];
+$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
+
+
+# Special forward rule for people & body part emoji:
+# don't break $ZWJ from subsequent $EmojiForSeqs
+$EmojiForSeqs $EmojiVar? $EmojiMods? $ZWJ $EmojiForSeqs;
+
+# LB 9     Combining marks.      X   $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL 
+#                                $CM not covered by the above needs to behave like $AL   
+#                                See definition of $CAN_CM.
+
+$CAN_CM $CM+;                   #  Stick together any combining sequences that don't match other rules.
+$CM+;
+
+#
+# LB 11  Do not break before or after WORD JOINER & related characters.
+#
+$CAN_CM $CM*  $WJcm;
+$LB8NonBreaks $WJcm;
+$CM+          $WJcm;
+
+$WJcm $CANT_CM;
+$WJcm $CAN_CM $CM*;
+
+#
+# LB 12  Do not break after NBSP and related characters.
+#         GL  x
+#
+$GLcm $CAN_CM $CM*;
+$GLcm $CANT_CM;
+ 
+#
+# LB 12a  Do not break before NBSP and related characters ...
+#            [^SP BA HY] x GL
+#
+[[$LB8NonBreaks] - [$SP $BA $BAX $HY]] $CM* $GLcm;
+$CM+ $GLcm;                     # by rule 10, stand-alone CM behaves as AL
+
+
+
+#
+# LB 13   Don't break before ']' or '!' or ';' or '/', even after spaces.
+#
+# Do not include $EXX here
+$LB8NonBreaks $CL;
+$CAN_CM $CM*  $CL;
+$CM+          $CL;              # by rule 10, stand-alone CM behaves as AL
+
+$LB8NonBreaks $CP;
+$CAN_CM $CM*  $CP;
+$CM+          $CP;              # by rule 10, stand-alone CM behaves as AL
+
+$LB8NonBreaks $EX;
+$CAN_CM $CM*  $EX;
+$CM+          $EX;              # by rule 10, stand-alone CM behaves as AL
+
+$LB8NonBreaks $IS;
+$CAN_CM $CM*  $IS;
+$CM+          $IS;              # by rule 10, stand-alone CM behaves as AL
+
+$LB8NonBreaks $SY;
+$CAN_CM $CM*  $SY;
+$CM+          $SY;              # by rule 10, stand-alone CM behaves as AL
+
+
+#
+# LB 14  Do not break after OP, even after spaces
+#
+$OPcm $SP* $CAN_CM $CM*;
+$OPcm $SP* $CANT_CM;
+
+$OPcm $SP+ $CM+ $AL_FOLLOW?;    # by rule 10, stand-alone CM behaves as AL
+
+# LB 15
+$QUcm $SP* $OPcm;
+
+# LB 16
+# Do not break between closing punctuation and $NS, even with intervening spaces
+# But DO allow a break between closing punctuation and $NSX, don't include it here
+($CLcm | $CPcm) $SP* $NScm;
+
+# LB 17
+$B2cm $SP* $B2cm;
+
+#
+# LB 18  Break after spaces.
+#
+$LB18NonBreaks = [$LB8NonBreaks - [$SP]];
+$LB18Breaks    = [$LB8Breaks $SP];
+
+
+# LB 19
+#         x QU
+$LB18NonBreaks $CM* $QUcm;
+$CM+                $QUcm;
+
+#         QU  x
+$QUcm .?;
+$QUcm $LB18NonBreaks $CM*;    # Don't let a combining mark go onto $CR, $BK, etc.
+                              #  TODO:  I don't think this rule is needed.
+
+
+# LB 20
+#        <break>  $CB
+#        $CB   <break>
+
+$LB20NonBreaks = [$LB18NonBreaks - $CB];
+
+# LB 21        x   (BA | HY | NS)
+#           BB x
+#
+# DO allow breaks here before $BAXcm and $NSXcm, so don't include them
+$LB20NonBreaks $CM* ($BAcm | $HYcm | $NScm); 
+
+$BBcm [^$CB];                                  #  $BB  x
+$BBcm $LB20NonBreaks $CM*;
+
+# LB 21a Don't break after Hebrew + Hyphen
+#   HL (HY | BA) x
+#  
+$HLcm ($HYcm | $BAcm | $BAXcm) [^$CB]?;
+
+# LB 21b (forward) Don't break between SY and HL
+# (break between HL and SY already disallowed by LB 13 above)
+$SYcm $HLcm;
+
+# LB 22
+($ALcm | $HLcm) $INcm;
+$CM+     $INcm;     #  by rule 10, any otherwise unattached CM behaves as AL
+$IDcm    $INcm;
+# $INcm  $INcm; # delete this rule for CSS loose
+$NUcm    $INcm;
+
+
+# LB 23
+# Do not include $POX here
+$IDcm  $POcm;
+$ALcm  $NUcm;       # includes $LB19
+$HLcm  $NUcm;
+$CM+   $NUcm;       # Rule 10, any otherwise unattached CM behaves as AL
+$NUcm  $ALcm;
+$NUcm  $HLcm;
+
+#
+# LB 24
+#
+# Do not include $PRX here
+$PRcm $IDcm;
+$PRcm ($ALcm | $HLcm);
+($POcm | $POXcm) ($ALcm | $HLcm);
+
+#
+# LB 25   Numbers.
+#
+# Here do not include $PRX at the beginning or $POX at the end
+($PRcm | $POcm | $POXcm)? ($OPcm | $HYcm)? $NUcm ($NUcm | $SYcm | $IScm)* ($CLcm | $CPcm)? ($PRcm | $PRXcm | $POcm)?;
+
+# LB 26  Do not break a Korean syllable
+#
+$JLcm ($JLcm | $JVcm | $H2cm | $H3cm);
+($JVcm | $H2cm) ($JVcm | $JTcm);
+($JTcm | $H3cm) $JTcm;
+
+# LB 27  Treat korean Syllable Block the same as ID  (don't break it)
+# Do not include $POX or $PRX here
+($JLcm | $JVcm | $JTcm | $H2cm | $H3cm) $INcm;
+($JLcm | $JVcm | $JTcm | $H2cm | $H3cm) $POcm;
+$PRcm ($JLcm | $JVcm | $JTcm | $H2cm | $H3cm);
+
+
+# LB 28   Do not break between alphabetics
+#
+($ALcm | $HLcm) ($ALcm | $HLcm);
+$CM+ ($ALcm | $HLcm);      # The $CM+ is from rule 10, an unattached CM is treated as AL
+
+# LB 29
+$IScm ($ALcm | $HLcm);
+
+# LB 30
+($ALcm | $HLcm | $NUcm) $OPcm;
+$CM+ $OPcm;         # The $CM+ is from rule 10, an unattached CM is treated as AL.          
+$CPcm ($ALcm | $HLcm | $NUcm);
+
+# LB 30a  Do not break between regional indicators.
+$RIcm $RIcm;
+
+# Special forward rule for people & body part emoji:
+# don't break between relevant emoji and $EmojiMods
+$EmojiForMods $EmojiVar? $EmojiMods;
+
+#
+#  Reverse Rules.
+#
+## -------------------------------------------------
+
+!!reverse;
+
+$CM+ $ALPlus;
+$CM+ $BA;
+$CM+ $BAX;
+$CM+ $BB;
+$CM+ $B2;
+$CM+ $CL;
+$CM+ $CP;
+$CM+ $EX;
+$CM+ $EXX;
+$CM+ $GL;
+$CM+ $HL;
+$CM+ $HY;
+$CM+ $H2;
+$CM+ $H3;
+$CM+ $ID;
+$CM+ $IN;
+$CM+ $IS;
+$CM+ $JL;
+$CM+ $JV;
+$CM+ $JT;
+$CM+ $NS;
+$CM+ $NSX;
+$CM+ $NU;
+$CM+ $OP;
+$CM+ $PO;
+$CM+ $POX;
+$CM+ $PR;
+$CM+ $PRX;
+$CM+ $QU;
+$CM+ $RI;
+$CM+ $SY;
+$CM+ $WJ;
+$CM+;
+
+
+#
+#  Sequences of the form  (shown forwards)
+#      [CANT_CM]  <break>  [CM]  [whatever]
+#  The CM needs to behave as an AL
+#
+$AL_FOLLOW $CM+ / (
+          [$BK $CR $LF $NL $ZW {eof}] |
+          $SP+ $CM+ $SP |
+          $SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}]));   # if LB 14 will match, need to suppress this break.
+                                               #  LB14 says    OP SP* x .        
+                                               #    becomes    OP SP* x AL
+                                               #    becomes    OP SP* x CM+ AL_FOLLOW
+                                               #
+                                               # Further note:  the $AL in [$AL {eof}] is only to work around
+                                               #                a rule compiler bug which complains about
+                                               #                empty sets otherwise.
+          
+#
+#  Sequences of the form  (shown forwards)
+#      [CANT_CM]  <break> [CM]  <break>  [PR]
+#  The CM needs to behave as an AL
+#  This rule is concerned about getting the second of the two <breaks> in place.
+#
+
+[$PR $PRX  ] / $CM+ [$BK $CR $LF $NL $ZW $SP {eof}];
+
+
+
+# LB 4, 5, 6
+
+$LB4Breaks [$LB4NonBreaks-$CM];
+$LB4Breaks $CM+ $CAN_CM;
+$LF $CR;
+
+
+# LB 7         x SP
+#              x ZW
+[$SP $ZW] [$LB4NonBreaks-$CM];
+[$SP $ZW] $CM+ $CAN_CM;
+
+# LB 8 ZW SP* <break>
+#     TODO: to implement this, we need more than one look-ahead hard break in play at a time.
+#           Requires an engine enhancement.
+#   / $SP* $ZW
+
+# Special reverse rule for people & body part emoji:
+# don't break $ZWJ from subsequent $EmojiForSeqs
+$EmojiForSeqs $ZWJ $EmojiMods? $EmojiVar? $EmojiForSeqs;
+
+# LB 9,10  Combining marks.
+#    X   $CM needs to behave like X, where X is not $SP or controls.
+#    $CM not covered by the above needs to behave like $AL
+# Stick together any combining sequences that don't match other rules.
+$CM+ $CAN_CM;
+
+
+# LB 11
+$CM* $WJ $CM* $CAN_CM;
+$CM* $WJ      [$LB8NonBreaks-$CM];
+
+     $CANT_CM $CM* $WJ;
+$CM* $CAN_CM  $CM* $WJ;
+
+# LB 12a
+#      [^SP BA HY] x GL
+#
+$CM* $GL $CM* [$LB8NonBreaks-[$CM $SP $BA $BAX $HY]];
+
+# LB 12
+#     GL  x
+#
+$CANT_CM $CM* $GL;
+$CM* $CAN_CM $CM* $GL;
+
+
+# LB 13
+# Do not include $EXX here
+$CL $CM+ $CAN_CM;
+$CP $CM+ $CAN_CM;
+$EX $CM+ $CAN_CM;
+$IS $CM+ $CAN_CM;
+$SY $CM+ $CAN_CM;
+
+$CL [$LB8NonBreaks-$CM];
+$CP [$LB8NonBreaks-$CM];
+$EX [$LB8NonBreaks-$CM];
+$IS [$LB8NonBreaks-$CM];
+$SY [$LB8NonBreaks-$CM];
+
+# Rule 13 & 14 taken together for an edge case.
+#   Match this, shown forward
+#     OP SP+  ($CM+ behaving as $AL) (CL | CP | EX | IS | SY)
+#   This really wants to chain at the $CM+ (which is acting as an $AL)
+#   except for $CM chaining being disabled.
+[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP;  
+
+# LB 14    OP SP* x
+#
+$CM* $CAN_CM    $SP* $CM* $OP;
+     $CANT_CM   $SP* $CM* $OP;
+$AL_FOLLOW? $CM+  $SP $SP* $CM* $OP;     #  by LB 10, behaves like $AL_FOLLOW? $AL $SP* $CM* $OP
+     
+     $AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;
+$CM* $AL_FOLLOW_CM   $CM+ $SP+ $CM* $OP;
+$SY $CM $SP+ $OP;   # TODO:  Experiment.  Remove.
+
+
+
+# LB 15
+$CM* $OP $SP* $CM* $QU;
+
+# LB 16
+# Don't include $NSX here
+$CM* $NS $SP* $CM* ($CL | $CP);
+
+# LB 17
+$CM* $B2 $SP* $CM* $B2;
+
+# LB 18  break after spaces
+#        Nothing explicit needed here.
+
+
+#
+# LB 19
+#
+$CM* $QU $CM* $CAN_CM;                                #   . x QU
+$CM* $QU      $LB18NonBreaks;
+
+
+$CM* $CAN_CM  $CM* $QU;                               #   QU x .
+     $CANT_CM $CM* $QU;
+     
+#
+#  LB 20  Break before and after CB.
+#         nothing needed here.
+#
+
+# LB 21
+# Don't include $BAX or $NSX here
+$CM* ($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM];     #  . x (BA | HY | NS)
+
+$CM* [$LB20NonBreaks-$CM] $CM* $BB;                   #  BB x .
+[^$CB] $CM* $BB;                                      # 
+
+# LB21a
+[^$CB] $CM* ($HY | $BA | $BAX) $CM* $HL;
+
+# LB21b (reverse)
+$CM* $HL $CM* $SY;
+
+# LB 22
+$CM* $IN $CM* ($ALPlus | $HL);
+$CM* $IN $CM* $ID;
+# $CM* $IN $CM* $IN; # delete this rule for CSS loose
+$CM* $IN $CM* $NU;
+
+# LB 23
+# Do not include $POX here
+$CM* $PO $CM* $ID;
+$CM* $NU $CM* ($ALPlus | $HL);
+$CM* ($ALPlus | $HL) $CM* $NU;
+
+# LB 24
+# Do not include $PRX here
+$CM* $ID $CM* $PR;
+$CM* ($ALPlus | $HL) $CM* $PR;
+$CM* ($ALPlus | $HL) $CM* ($PO | $POX);
+
+
+# LB 25
+# Here do not include $POX at the beginning or $PRX at the end
+($CM* ($PR | $PRX | $PO))? ($CM* ($CL | $CP))? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP | $HY))? ($CM* ($PR | $PO | $POX))?;
+
+# LB 26
+$CM* ($H3 | $H2 | $JV | $JL) $CM* $JL;
+$CM* ($JT | $JV) $CM* ($H2 | $JV);
+$CM* $JT $CM* ($H3 | $JT);
+
+# LB 27
+# Do not include $POX or $PRX here
+$CM* $IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
+$CM* $PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
+$CM* ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
+
+# LB 28
+$CM* ($ALPlus | $HL) $CM* ($ALPlus | $HL);
+
+
+# LB 29
+$CM* ($ALPlus | $HL) $CM* $IS;
+
+# LB 30
+$CM* $OP $CM* ($ALPlus | $HL | $NU);
+$CM* ($ALPlus | $HL | $NU) $CM* $CP;
+
+# LB 30a
+$CM* $RI $CM* $RI;
+
+# Special reverse rule for people & body part emoji:
+# don't break between relevant emoji and $EmojiMods
+$EmojiMods $EmojiVar? $EmojiForMods;
+
+## -------------------------------------------------
+
+!!safe_reverse;
+
+# LB 9
+$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
+$CM+ $SP / .;
+
+# LB 14
+$SP+ $CM* $OP;
+
+# LB 15
+$SP+ $CM* $QU;
+
+# LB 16
+$SP+ $CM* ($CL | $CP);
+
+# LB 17
+$SP+ $CM* $B2;
+
+# LB 21a
+$CM* ($HY | $BA) $CM* $HL;
+
+# LB 25
+($CM* ($IS | $SY))+ $CM* $NU;
+($CL | $CP) $CM* ($NU | $IS | $SY);
+
+# For dictionary-based break
+$dictionary $dictionary;
+
+## -------------------------------------------------
+
+!!safe_forward;
+
+# Skip forward over all character classes that are involved in
+#   rules containing patterns with possibly more than one char
+#   of context.
+#
+#  It might be slightly more efficient to have specific rules
+#  instead of one generic one, but only if we could
+#  turn off rule chaining.  We don't want to move more
+#  than necessary.
+#
+[$CM $OP $QU $CL $CP $B2 $PR $PRX $HY $BA $SP $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $PRX $HY $BA $dictionary];
+$dictionary $dictionary;
+
diff --git a/icuSources/data/brkitr/line_loose_fi.txt b/icuSources/data/brkitr/line_loose_fi.txt

new file mode 100644 (file)

index 0000000..94e2c4d
--- /dev/null
+++ b/icuSources/data/brkitr/line_loose_fi.txt
@@ -0,0 +1,725 @@
+# Copyright (c) 2002-2015  International Business Machines Corporation and
+# others. All Rights Reserved.
+#
+#  file:  line_loose_fi.txt
+#
+#         Line Breaking Rules
+#         Implement default line breaking as defined by 
+#         Unicode Standard Annex #14 Revision 29 for Unicode 6.2
+#         http://www.unicode.org/reports/tr14/
+#         tailored as noted in 2nd paragraph below.
+#
+#         TODO:  Rule LB 8 remains as it was in Unicode 5.2
+#         This is only because of a limitation of ICU break engine implementation,
+#         not because the older behavior is desirable.
+#
+#         This tailors the line break behavior both for Finnish and to correspond to CSS
+#         line-break=loose (BCP47 -u-lb-loose) as defined for languages other than 
+#         Chinese & Japanese.
+#         It sets characters of class CJ to behave like ID.
+#         In addition, it allows breaks before 3005, 303B, 309D, 309E, 30FD, 30FE (all NS).
+
+#
+#  Character Classes defined by TR 14.
+#
+
+!!chain;
+!!LBCMNoChain;
+
+
+!!lookAheadHardBreak;
+#
+#  !!lookAheadHardBreak    Described here because it is (as yet) undocumented elsewhere
+#                          and only used for the line break rules.
+#
+#           It is used in the implementation of rule LB 10
+#           which says to treat any combining mark that is not attached to a base
+#           character as if it were of class AL  (alphabetic).
+#
+#           The problem occurs in the reverse rules.
+#
+#           Consider a sequence like, with correct breaks as shown
+#               LF  ID  CM  AL  AL
+#                  ^       ^       ^
+#           Then consider the sequence without the initial ID (ideographic)
+#                 LF  CM  AL  AL
+#                    ^           ^
+#           Our CM, which in the first example was attached to the ideograph,
+#           is now unattached, becomes an alpha, and joins in with the other
+#           alphas.
+#
+#           When iterating forwards, these sequences do not present any problems
+#           When iterating backwards, we need to look ahead when encountering
+#           a CM to see whether it attaches to something further on or not.
+#           (Look-ahead in a reverse rule is looking towards the start)
+#
+#           If the CM is unattached, we need to force a break.
+#
+#           !!lookAheadHardBreak forces the run time state machine to
+#           stop immediately when a look ahead rule ( '/' operator) matches,
+#           and set the match position to that of the look-ahead operator,
+#           no matter what other rules may be in play at the time.
+#
+#           See rule LB 19 for an example.
+#
+
+$AI = [:LineBreak =  Ambiguous:];
+$AL = [:LineBreak =  Alphabetic:];
+$BA = [[:LineBreak =  Break_After:] - [\u2010]];
+$HH = [\u2010];
+$BB = [:LineBreak =  Break_Before:];
+$BK = [:LineBreak =  Mandatory_Break:];
+$B2 = [:LineBreak =  Break_Both:];
+$CB = [:LineBreak =  Contingent_Break:];
+$CJ = [:LineBreak =  Conditional_Japanese_Starter:];
+$CL = [:LineBreak =  Close_Punctuation:];
+$CM = [:LineBreak =  Combining_Mark:];
+$CP = [:LineBreak =  Close_Parenthesis:];
+$CR = [:LineBreak =  Carriage_Return:];
+$EX = [:LineBreak =  Exclamation:];
+$GL = [:LineBreak =  Glue:];
+$HL = [:LineBreak =  Hebrew_Letter:];
+$HY = [:LineBreak =  Hyphen:];
+$H2 = [:LineBreak =  H2:];
+$H3 = [:LineBreak =  H3:];
+$ID = [[:LineBreak =  Ideographic:] $CJ];
+$IN = [:LineBreak =  Inseperable:];
+$IS = [:LineBreak =  Infix_Numeric:];
+$JL = [:LineBreak =  JL:];
+$JV = [:LineBreak =  JV:];
+$JT = [:LineBreak =  JT:];
+$LF = [:LineBreak =  Line_Feed:];
+$NL = [:LineBreak =  Next_Line:];
+$NSX = [\u3005 \u303B \u309D \u309E \u30FD \u30FE];
+$NS = [[:LineBreak =  Nonstarter:] - $NSX];
+$NU = [:LineBreak =  Numeric:];
+$OP = [:LineBreak =  Open_Punctuation:];
+$PO = [:LineBreak =  Postfix_Numeric:];
+$PR = [:LineBreak =  Prefix_Numeric:];
+$QU = [:LineBreak =  Quotation:];
+$RI = [:LineBreak =  Regional_Indicator:];
+$SA = [:LineBreak =  Complex_Context:];
+$SG = [:LineBreak =  Surrogate:];
+$SP = [:LineBreak =  Space:];
+$SY = [:LineBreak =  Break_Symbols:];
+$WJ = [:LineBreak =  Word_Joiner:];
+$XX = [:LineBreak =  Unknown:];
+$ZW = [:LineBreak =  ZWSpace:];
+
+# Special character classes for people & body part emoji:
+# Subsets of $CM:
+$ZWJ = \u200D;
+$EmojiVar = \uFE0F;
+# The following are subsets of $ID
+$EmojiForSeqs = [\u2764 \U0001F466-\U0001F469 \U0001F48B];
+$EmojiForMods = [\u261D \u270A-\u270C \U0001F385 \U0001F3C3-\U0001F3C4 \U0001F3C7 \U0001F3CA \U0001F442-\U0001F443 \U0001F446-\U0001F450 \U0001F466-\U0001F469 \U0001F46E-\U0001F478 \U0001F47C \U0001F481-\U0001F483 \U0001F485-\U0001F487 \U0001F4AA \U0001F596 \U0001F645-\U0001F647 \U0001F64B-\U0001F64F \U0001F6A3 \U0001F6B4-\U0001F6B6 \U0001F6C0];
+$EmojiMods = [\U0001F3FB-\U0001F3FF];
+
+#   Dictionary character set, for triggering language-based break engines. Currently
+#   limited to LineBreak=Complex_Context. Note that this set only works in Unicode
+#   5.0 or later as the definition of Complex_Context was corrected to include all
+#   characters requiring dictionary break.
+
+$dictionary = [:LineBreak = Complex_Context:];
+
+#
+#  Rule LB1.  By default, treat AI  (characters with ambiguous east Asian width),
+#                               SA  (South East Asian: Thai, Lao, Khmer)
+#                               SG  (Unpaired Surrogates)
+#                               XX  (Unknown, unassigned)
+#                         as $AL  (Alphabetic)
+#
+$ALPlus = [$AL $AI $SA $SG $XX];
+
+#
+#  Combining Marks.   X $CM*  behaves as if it were X.  Rule LB 9.
+#
+$ALcm = $ALPlus $CM*;
+$BAcm = $BA $CM*;
+$HHcm = $HH $CM*;
+$BBcm = $BB $CM*;
+$B2cm = $B2 $CM*;
+$CLcm = $CL $CM*;
+$CPcm = $CP $CM*;
+$EXcm = $EX $CM*;
+$GLcm = $GL $CM*;
+$HLcm = $HL $CM*;
+$HYcm = $HY $CM*;
+$H2cm = $H2 $CM*;
+$H3cm = $H3 $CM*;
+$IDcm = $ID $CM*;
+$INcm = $IN $CM*;
+$IScm = $IS $CM*;
+$JLcm = $JL $CM*;
+$JVcm = $JV $CM*;
+$JTcm = $JT $CM*;
+$NScm = $NS $CM*;
+$NSXcm = $NSX $CM*;
+$NUcm = $NU $CM*;
+$OPcm = $OP $CM*;
+$POcm = $PO $CM*;
+$PRcm = $PR $CM*;
+$QUcm = $QU $CM*;
+$RIcm = $RI $CM*;
+$SYcm = $SY $CM*;
+$WJcm = $WJ $CM*;
+
+## -------------------------------------------------
+
+!!forward;
+
+#
+#  Each class of character can stand by itself as an unbroken token, with trailing combining stuff
+#
+$ALPlus $CM+;
+$BA $CM+;
+$HH $CM+;
+$BB $CM+;
+$B2 $CM+;
+$CL $CM+;
+$CP $CM+;
+$EX $CM+;
+$GL $CM+;
+$HL $CM+;
+$HY $CM+;
+$H2 $CM+;
+$H3 $CM+;
+$ID $CM+;
+$IN $CM+;
+$IS $CM+;
+$JL $CM+;
+$JV $CM+;
+$JT $CM+;
+$NS $CM+;
+$NSX $CM+;
+$NU $CM+;
+$OP $CM+;
+$PO $CM+;
+$PR $CM+;
+$QU $CM+;
+$RI $CM+;
+$SY $CM+;
+$WJ $CM+;
+
+#
+# CAN_CM  is the set of characters that may combine with CM combining chars.
+#         Note that Linebreak UAX 14's concept of a combining char and the rules
+#         for what they can combine with are _very_ different from the rest of Unicode.
+#
+#         Note that $CM itself is left out of this set.  If CM is needed as a base
+#         it must be listed separately in the rule.
+#
+$CAN_CM  = [^$SP $BK $CR $LF $NL $ZW $CM];       # Bases that can   take CMs
+$CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM];       # Bases that can't take CMs
+
+#
+# AL_FOLLOW  set of chars that can unconditionally follow an AL
+#            Needed in rules where stand-alone $CM s are treated as AL.
+#            Chaining is disabled with CM because it causes other failures,
+#            so for this one case we need to manually list out longer sequences.
+#
+$AL_FOLLOW_NOCM = [$BK $CR $LF $NL $ZW $SP];
+$AL_FOLLOW_CM   = [$CL $CP $EX $HL $IS $SY $WJ $GL $OP $QU $BA $HH $HY $NS $NSX $IN $NU $ALPlus];
+$AL_FOLLOW      = [$AL_FOLLOW_NOCM $AL_FOLLOW_CM];
+
+
+#
+#  Rule LB 4, 5    Mandatory (Hard) breaks.
+#
+$LB4Breaks    = [$BK $CR $LF $NL];
+$LB4NonBreaks = [^$BK $CR $LF $NL];
+$CR $LF {100};
+
+#
+#  LB 6    Do not break before hard line breaks.
+#
+$LB4NonBreaks?  $LB4Breaks {100};    # LB 6  do not break before hard breaks.
+$CAN_CM $CM*    $LB4Breaks {100};
+$CM+            $LB4Breaks {100};
+
+# LB 7         x SP
+#              x ZW
+$LB4NonBreaks [$SP $ZW];
+$CAN_CM $CM*  [$SP $ZW];
+$CM+          [$SP $ZW];
+
+#
+# LB 8         Break after zero width space
+#              TODO:  ZW SP* <break>
+#              An engine change is required to write the reverse rule for this.
+#              For now, leave the Unicode 5.2 rule, ZW <break>
+#
+$LB8Breaks    = [$LB4Breaks $ZW];
+$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
+
+
+# Special forward rule for people & body part emoji:
+# don't break $ZWJ from subsequent $EmojiForSeqs
+$EmojiForSeqs $EmojiVar? $EmojiMods? $ZWJ $EmojiForSeqs;
+
+# LB 9     Combining marks.      X   $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL 
+#                                $CM not covered by the above needs to behave like $AL   
+#                                See definition of $CAN_CM.
+
+$CAN_CM $CM+;                   #  Stick together any combining sequences that don't match other rules.
+$CM+;
+
+#
+# LB 11  Do not break before or after WORD JOINER & related characters.
+#
+$CAN_CM $CM*  $WJcm;
+$LB8NonBreaks $WJcm;
+$CM+          $WJcm;
+
+$WJcm $CANT_CM;
+$WJcm $CAN_CM $CM*;
+
+#
+# LB 12  Do not break after NBSP and related characters.
+#         GL  x
+#
+$GLcm $CAN_CM $CM*;
+$GLcm $CANT_CM;
+ 
+#
+# LB 12a  Do not break before NBSP and related characters ...
+#            [^SP BA HY] x GL
+#
+[[$LB8NonBreaks] - [$SP $BA $HH $HY]] $CM* $GLcm;
+$CM+ $GLcm;                     # by rule 10, stand-alone CM behaves as AL
+
+
+
+#
+# LB 13   Don't break before ']' or '!' or ';' or '/', even after spaces.
+#
+$LB8NonBreaks $CL;
+$CAN_CM $CM*  $CL;
+$CM+          $CL;              # by rule 10, stand-alone CM behaves as AL
+
+$LB8NonBreaks $CP;
+$CAN_CM $CM*  $CP;
+$CM+          $CP;              # by rule 10, stand-alone CM behaves as AL
+
+$LB8NonBreaks $EX;
+$CAN_CM $CM*  $EX;
+$CM+          $EX;              # by rule 10, stand-alone CM behaves as AL
+
+$LB8NonBreaks $IS;
+$CAN_CM $CM*  $IS;
+$CM+          $IS;              # by rule 10, stand-alone CM behaves as AL
+
+$LB8NonBreaks $SY;
+$CAN_CM $CM*  $SY;
+$CM+          $SY;              # by rule 10, stand-alone CM behaves as AL
+
+
+#
+# LB 14  Do not break after OP, even after spaces
+#
+$OPcm $SP* $CAN_CM $CM*;
+$OPcm $SP* $CANT_CM;
+
+$OPcm $SP+ $CM+ $AL_FOLLOW?;    # by rule 10, stand-alone CM behaves as AL
+
+# LB 15
+$QUcm $SP* $OPcm;
+
+# LB 16
+# Do not break between closing punctuation and $NS, even with intervening spaces
+# But DO allow a break between closing punctuation and $NSX, don't include it here
+($CLcm | $CPcm) $SP* $NScm;
+
+# LB 17
+$B2cm $SP* $B2cm;
+
+#
+# LB 18  Break after spaces.
+#
+$LB18NonBreaks = [$LB8NonBreaks - [$SP]];
+$LB18Breaks    = [$LB8Breaks $SP];
+
+
+# LB 19
+#         x QU
+$LB18NonBreaks $CM* $QUcm;
+$CM+                $QUcm;
+
+#         QU  x
+$QUcm .?;
+$QUcm $LB18NonBreaks $CM*;    # Don't let a combining mark go onto $CR, $BK, etc.
+                              #  TODO:  I don't think this rule is needed.
+
+
+# LB 20
+#        <break>  $CB
+#        $CB   <break>
+
+$LB20NonBreaks = [$LB18NonBreaks - $CB];
+
+# LB 20.09 added rule for Finnish tailoring
+# LB 21        x   (BA | HY | NS)
+#           BB x
+#
+# DO allow breaks here before NSXcm, so don't include it
+$LB20NonBreaks $CM* ($BAcm | $HHcm | $HYcm | $NScm) / $AL; 
+$LB20NonBreaks $CM* ($BAcm | $HHcm | $HYcm | $NScm); 
+($HY | $HH) $AL;
+
+$BBcm [^$CB];                                  #  $BB  x
+$BBcm $LB20NonBreaks $CM*;
+
+# LB 21a Don't break after Hebrew + Hyphen
+#   HL (HY | BA) x
+#  
+$HLcm ($HYcm | $BAcm | $HHcm) [^$CB]?;
+
+# LB 21b (forward) Don't break between SY and HL
+# (break between HL and SY already disallowed by LB 13 above)
+$SYcm $HLcm;
+
+# LB 22
+($ALcm | $HLcm) $INcm;
+$CM+     $INcm;     #  by rule 10, any otherwise unattached CM behaves as AL
+$IDcm    $INcm;
+$INcm    $INcm;
+$NUcm    $INcm;
+
+
+# LB 23
+$IDcm  $POcm;
+$ALcm  $NUcm;       # includes $LB19
+$HLcm  $NUcm;
+$CM+   $NUcm;       # Rule 10, any otherwise unattached CM behaves as AL
+$NUcm  $ALcm;
+$NUcm  $HLcm;
+
+#
+# LB 24
+#
+$PRcm $IDcm;
+$PRcm ($ALcm | $HLcm);
+$POcm ($ALcm | $HLcm);
+
+#
+# LB 25   Numbers.
+#
+($PRcm | $POcm)? ($OPcm | $HYcm)? $NUcm ($NUcm | $SYcm | $IScm)* ($CLcm | $CPcm)? ($PRcm | $POcm)?;
+
+# LB 26  Do not break a Korean syllable
+#
+$JLcm ($JLcm | $JVcm | $H2cm | $H3cm);
+($JVcm | $H2cm) ($JVcm | $JTcm);
+($JTcm | $H3cm) $JTcm;
+
+# LB 27  Treat korean Syllable Block the same as ID  (don't break it)
+($JLcm | $JVcm | $JTcm | $H2cm | $H3cm) $INcm;
+($JLcm | $JVcm | $JTcm | $H2cm | $H3cm) $POcm;
+$PRcm ($JLcm | $JVcm | $JTcm | $H2cm | $H3cm);
+
+
+# LB 28   Do not break between alphabetics
+#
+($ALcm | $HLcm) ($ALcm | $HLcm);
+$CM+ ($ALcm | $HLcm);      # The $CM+ is from rule 10, an unattached CM is treated as AL
+
+# LB 29
+$IScm ($ALcm | $HLcm);
+
+# LB 30
+($ALcm | $HLcm | $NUcm) $OPcm;
+$CM+ $OPcm;         # The $CM+ is from rule 10, an unattached CM is treated as AL.          
+$CPcm ($ALcm | $HLcm | $NUcm);
+
+# LB 30a  Do not break between regional indicators.
+$RIcm $RIcm;
+
+# Special forward rule for people & body part emoji:
+# don't break between relevant emoji and $EmojiMods
+$EmojiForMods $EmojiVar? $EmojiMods;
+
+#
+#  Reverse Rules.
+#
+## -------------------------------------------------
+
+!!reverse;
+
+$CM+ $ALPlus;
+$CM+ $BA;
+$CM+ $HH;
+$CM+ $BB;
+$CM+ $B2;
+$CM+ $CL;
+$CM+ $CP;
+$CM+ $EX;
+$CM+ $GL;
+$CM+ $HL;
+$CM+ $HY;
+$CM+ $H2;
+$CM+ $H3;
+$CM+ $ID;
+$CM+ $IN;
+$CM+ $IS;
+$CM+ $JL;
+$CM+ $JV;
+$CM+ $JT;
+$CM+ $NS;
+$CM+ $NSX;
+$CM+ $NU;
+$CM+ $OP;
+$CM+ $PO;
+$CM+ $PR;
+$CM+ $QU;
+$CM+ $RI;
+$CM+ $SY;
+$CM+ $WJ;
+$CM+;
+
+
+#
+#  Sequences of the form  (shown forwards)
+#      [CANT_CM]  <break>  [CM]  [whatever]
+#  The CM needs to behave as an AL
+#
+$AL_FOLLOW $CM+ / (
+          [$BK $CR $LF $NL $ZW {eof}] |
+          $SP+ $CM+ $SP |
+          $SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}]));   # if LB 14 will match, need to suppress this break.
+                                               #  LB14 says    OP SP* x .        
+                                               #    becomes    OP SP* x AL
+                                               #    becomes    OP SP* x CM+ AL_FOLLOW
+                                               #
+                                               # Further note:  the $AL in [$AL {eof}] is only to work around
+                                               #                a rule compiler bug which complains about
+                                               #                empty sets otherwise.
+          
+#
+#  Sequences of the form  (shown forwards)
+#      [CANT_CM]  <break> [CM]  <break>  [PR]
+#  The CM needs to behave as an AL
+#  This rule is concerned about getting the second of the two <breaks> in place.
+#
+
+[$PR   ] / $CM+ [$BK $CR $LF $NL $ZW $SP {eof}];
+
+
+
+# LB 4, 5, 6
+
+$LB4Breaks [$LB4NonBreaks-$CM];
+$LB4Breaks $CM+ $CAN_CM;
+$LF $CR;
+
+
+# LB 7         x SP
+#              x ZW
+[$SP $ZW] [$LB4NonBreaks-$CM];
+[$SP $ZW] $CM+ $CAN_CM;
+
+# LB 8 ZW SP* <break>
+#     TODO: to implement this, we need more than one look-ahead hard break in play at a time.
+#           Requires an engine enhancement.
+#   / $SP* $ZW
+
+# Special reverse rule for people & body part emoji:
+# don't break $ZWJ from subsequent $EmojiForSeqs
+$EmojiForSeqs $ZWJ $EmojiMods? $EmojiVar? $EmojiForSeqs;
+
+# LB 9,10  Combining marks.
+#    X   $CM needs to behave like X, where X is not $SP or controls.
+#    $CM not covered by the above needs to behave like $AL
+# Stick together any combining sequences that don't match other rules.
+$CM+ $CAN_CM;
+
+
+# LB 11
+$CM* $WJ $CM* $CAN_CM;
+$CM* $WJ      [$LB8NonBreaks-$CM];
+
+     $CANT_CM $CM* $WJ;
+$CM* $CAN_CM  $CM* $WJ;
+
+# LB 12a
+#      [^SP BA HY] x GL
+#
+$CM* $GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HH $HY]];
+
+# LB 12
+#     GL  x
+#
+$CANT_CM $CM* $GL;
+$CM* $CAN_CM $CM* $GL;
+
+
+# LB 13
+$CL $CM+ $CAN_CM;
+$CP $CM+ $CAN_CM;
+$EX $CM+ $CAN_CM;
+$IS $CM+ $CAN_CM;
+$SY $CM+ $CAN_CM;
+
+$CL [$LB8NonBreaks-$CM];
+$CP [$LB8NonBreaks-$CM];
+$EX [$LB8NonBreaks-$CM];
+$IS [$LB8NonBreaks-$CM];
+$SY [$LB8NonBreaks-$CM];
+
+# Rule 13 & 14 taken together for an edge case.
+#   Match this, shown forward
+#     OP SP+  ($CM+ behaving as $AL) (CL | CP | EX | IS | SY)
+#   This really wants to chain at the $CM+ (which is acting as an $AL)
+#   except for $CM chaining being disabled.
+[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP;  
+
+# LB 14    OP SP* x
+#
+$CM* $CAN_CM    $SP* $CM* $OP;
+     $CANT_CM   $SP* $CM* $OP;
+$AL_FOLLOW? $CM+  $SP $SP* $CM* $OP;     #  by LB 10, behaves like $AL_FOLLOW? $AL $SP* $CM* $OP
+     
+     $AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;
+$CM* $AL_FOLLOW_CM   $CM+ $SP+ $CM* $OP;
+$SY $CM $SP+ $OP;   # TODO:  Experiment.  Remove.
+
+
+
+# LB 15
+$CM* $OP $SP* $CM* $QU;
+
+# LB 16
+# Don't include $NSX here
+$CM* $NS $SP* $CM* ($CL | $CP);
+
+# LB 17
+$CM* $B2 $SP* $CM* $B2;
+
+# LB 18  break after spaces
+#        Nothing explicit needed here.
+
+
+#
+# LB 19
+#
+$CM* $QU $CM* $CAN_CM;                                #   . x QU
+$CM* $QU      $LB18NonBreaks;
+
+
+$CM* $CAN_CM  $CM* $QU;                               #   QU x .
+     $CANT_CM $CM* $QU;
+     
+#
+#  LB 20  Break before and after CB.
+#         nothing needed here.
+#
+
+# LB 20.09 added rule for Finnish tailoring
+$AL ($HY | $HH) / $SP;
+
+# LB 21
+# Don't include $NSX here
+$CM* ($BA | $HH | $HY | $NS) $CM* [$LB20NonBreaks-$CM];     #  . x (BA | HY | NS)
+
+$CM* [$LB20NonBreaks-$CM] $CM* $BB;                   #  BB x .
+[^$CB] $CM* $BB;                                      # 
+
+# LB21a
+[^$CB] $CM* ($HY | $BA | $HH) $CM* $HL;
+
+# LB21b (reverse)
+$CM* $HL $CM* $SY;
+
+# LB 22
+$CM* $IN $CM* ($ALPlus | $HL);
+$CM* $IN $CM* $ID;
+$CM* $IN $CM* $IN;
+$CM* $IN $CM* $NU;
+
+# LB 23
+$CM* $PO $CM* $ID;
+$CM* $NU $CM* ($ALPlus | $HL);
+$CM* ($ALPlus | $HL) $CM* $NU;
+
+# LB 24
+$CM* $ID $CM* $PR;
+$CM* ($ALPlus | $HL) $CM* $PR;
+$CM* ($ALPlus | $HL) $CM* $PO;
+
+
+# LB 25
+($CM* ($PR | $PO))? ($CM* ($CL | $CP))? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP | $HY))? ($CM* ($PR | $PO))?;
+
+# LB 26
+$CM* ($H3 | $H2 | $JV | $JL) $CM* $JL;
+$CM* ($JT | $JV) $CM* ($H2 | $JV);
+$CM* $JT $CM* ($H3 | $JT);
+
+# LB 27
+$CM* $IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
+$CM* $PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
+$CM* ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
+
+# LB 28
+$CM* ($ALPlus | $HL) $CM* ($ALPlus | $HL);
+
+
+# LB 29
+$CM* ($ALPlus | $HL) $CM* $IS;
+
+# LB 30
+$CM* $OP $CM* ($ALPlus | $HL | $NU);
+$CM* ($ALPlus | $HL | $NU) $CM* $CP;
+
+# LB 30a
+$CM* $RI $CM* $RI;
+
+# Special reverse rule for people & body part emoji:
+# don't break between relevant emoji and $EmojiMods
+$EmojiMods $EmojiVar? $EmojiForMods;
+
+## -------------------------------------------------
+
+!!safe_reverse;
+
+# LB 9
+$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
+$CM+ $SP / .;
+
+# LB 14
+$SP+ $CM* $OP;
+
+# LB 15
+$SP+ $CM* $QU;
+
+# LB 16
+$SP+ $CM* ($CL | $CP);
+
+# LB 17
+$SP+ $CM* $B2;
+
+# LB 21a
+$CM* ($HY | $BA | $HH) $CM* $HL;
+
+# LB 25
+($CM* ($IS | $SY))+ $CM* $NU;
+($CL | $CP) $CM* ($NU | $IS | $SY);
+
+# For dictionary-based break
+$dictionary $dictionary;
+
+## -------------------------------------------------
+
+!!safe_forward;
+
+# Skip forward over all character classes that are involved in
+#   rules containing patterns with possibly more than one char
+#   of context.
+#
+#  It might be slightly more efficient to have specific rules
+#  instead of one generic one, but only if we could
+#  turn off rule chaining.  We don't want to move more
+#  than necessary.
+#
+[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $SP $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $dictionary];
+$dictionary $dictionary;
+
diff --git a/icuSources/data/brkitr/line_normal.txt b/icuSources/data/brkitr/line_normal.txt

new file mode 100644 (file)

index 0000000..1c637f2
--- /dev/null
+++ b/icuSources/data/brkitr/line_normal.txt
@@ -0,0 +1,705 @@
+# Copyright (c) 2002-2015  International Business Machines Corporation and
+# others. All Rights Reserved.
+#
+#  file:  line_normal.txt
+#
+#         Line Breaking Rules
+#         Implement default line breaking as defined by 
+#         Unicode Standard Annex #14 Revision 29 for Unicode 6.2
+#         http://www.unicode.org/reports/tr14/
+#         tailored as noted in 2nd paragraph below..
+#
+#         TODO:  Rule LB 8 remains as it was in Unicode 5.2
+#         This is only because of a limitation of ICU break engine implementation,
+#         not because the older behavior is desirable.
+#
+#         This tailors the line break behavior to correspond to CSS
+#         line-break=normal (BCP47 -u-lb-normal) as defined for languages other than 
+#         Chinese & Japanese.
+#         It sets characters of class CJ to behave like ID.
+
+#
+#  Character Classes defined by TR 14.
+#
+
+!!chain;
+!!LBCMNoChain;
+
+
+!!lookAheadHardBreak;
+#
+#  !!lookAheadHardBreak    Described here because it is (as yet) undocumented elsewhere
+#                          and only used for the line break rules.
+#
+#           It is used in the implementation of rule LB 10
+#           which says to treat any combining mark that is not attached to a base
+#           character as if it were of class AL  (alphabetic).
+#
+#           The problem occurs in the reverse rules.
+#
+#           Consider a sequence like, with correct breaks as shown
+#               LF  ID  CM  AL  AL
+#                  ^       ^       ^
+#           Then consider the sequence without the initial ID (ideographic)
+#                 LF  CM  AL  AL
+#                    ^           ^
+#           Our CM, which in the first example was attached to the ideograph,
+#           is now unattached, becomes an alpha, and joins in with the other
+#           alphas.
+#
+#           When iterating forwards, these sequences do not present any problems
+#           When iterating backwards, we need to look ahead when encountering
+#           a CM to see whether it attaches to something further on or not.
+#           (Look-ahead in a reverse rule is looking towards the start)
+#
+#           If the CM is unattached, we need to force a break.
+#
+#           !!lookAheadHardBreak forces the run time state machine to
+#           stop immediately when a look ahead rule ( '/' operator) matches,
+#           and set the match position to that of the look-ahead operator,
+#           no matter what other rules may be in play at the time.
+#
+#           See the "$AL_FOLLOW $CM+ / (...)" rule under !!reverse for an example.
+#
+
+$AI = [:LineBreak =  Ambiguous:];
+$AL = [:LineBreak =  Alphabetic:];
+$BA = [:LineBreak =  Break_After:];
+$BB = [:LineBreak =  Break_Before:];
+$BK = [:LineBreak =  Mandatory_Break:];
+$B2 = [:LineBreak =  Break_Both:];
+$CB = [:LineBreak =  Contingent_Break:];
+$CJ = [:LineBreak =  Conditional_Japanese_Starter:];
+$CL = [:LineBreak =  Close_Punctuation:];
+$CM = [:LineBreak =  Combining_Mark:];
+$CP = [:LineBreak =  Close_Parenthesis:];
+$CR = [:LineBreak =  Carriage_Return:];
+$EX = [:LineBreak =  Exclamation:];
+$GL = [:LineBreak =  Glue:];
+$HL = [:LineBreak =  Hebrew_Letter:];
+$HY = [:LineBreak =  Hyphen:];
+$H2 = [:LineBreak =  H2:];
+$H3 = [:LineBreak =  H3:];
+$ID = [[:LineBreak =  Ideographic:] $CJ];
+$IN = [:LineBreak =  Inseperable:];
+$IS = [:LineBreak =  Infix_Numeric:];
+$JL = [:LineBreak =  JL:];
+$JV = [:LineBreak =  JV:];
+$JT = [:LineBreak =  JT:];
+$LF = [:LineBreak =  Line_Feed:];
+$NL = [:LineBreak =  Next_Line:];
+$NS = [:LineBreak =  Nonstarter:];
+$NU = [:LineBreak =  Numeric:];
+$OP = [:LineBreak =  Open_Punctuation:];
+$PO = [:LineBreak =  Postfix_Numeric:];
+$PR = [:LineBreak =  Prefix_Numeric:];
+$QU = [:LineBreak =  Quotation:];
+$RI = [:LineBreak =  Regional_Indicator:];
+$SA = [:LineBreak =  Complex_Context:];
+$SG = [:LineBreak =  Surrogate:];
+$SP = [:LineBreak =  Space:];
+$SY = [:LineBreak =  Break_Symbols:];
+$WJ = [:LineBreak =  Word_Joiner:];
+$XX = [:LineBreak =  Unknown:];
+$ZW = [:LineBreak =  ZWSpace:];
+
+# Special character classes for people & body part emoji:
+# Subsets of $CM:
+$ZWJ = \u200D;
+$EmojiVar = \uFE0F;
+# The following are subsets of $ID
+$EmojiForSeqs = [\u2764 \U0001F466-\U0001F469 \U0001F48B];
+$EmojiForMods = [\u261D \u270A-\u270C \U0001F385 \U0001F3C3-\U0001F3C4 \U0001F3C7 \U0001F3CA \U0001F442-\U0001F443 \U0001F446-\U0001F450 \U0001F466-\U0001F469 \U0001F46E-\U0001F478 \U0001F47C \U0001F481-\U0001F483 \U0001F485-\U0001F487 \U0001F4AA \U0001F596 \U0001F645-\U0001F647 \U0001F64B-\U0001F64F \U0001F6A3 \U0001F6B4-\U0001F6B6 \U0001F6C0];
+$EmojiMods = [\U0001F3FB-\U0001F3FF];
+
+#   Dictionary character set, for triggering language-based break engines. Currently
+#   limited to LineBreak=Complex_Context. Note that this set only works in Unicode
+#   5.0 or later as the definition of Complex_Context was corrected to include all
+#   characters requiring dictionary break.
+
+$dictionary = [:LineBreak = Complex_Context:];
+
+#
+#  Rule LB1.  By default, treat AI  (characters with ambiguous east Asian width),
+#                               SA  (South East Asian: Thai, Lao, Khmer)
+#                               SG  (Unpaired Surrogates)
+#                               XX  (Unknown, unassigned)
+#                         as $AL  (Alphabetic)
+#
+$ALPlus = [$AL $AI $SA $SG $XX];
+
+#
+#  Combining Marks.   X $CM*  behaves as if it were X.  Rule LB 9.
+#
+$ALcm = $ALPlus $CM*;
+$BAcm = $BA $CM*;
+$BBcm = $BB $CM*;
+$B2cm = $B2 $CM*;
+$CLcm = $CL $CM*;
+$CPcm = $CP $CM*;
+$EXcm = $EX $CM*;
+$GLcm = $GL $CM*;
+$HLcm = $HL $CM*;
+$HYcm = $HY $CM*;
+$H2cm = $H2 $CM*;
+$H3cm = $H3 $CM*;
+$IDcm = $ID $CM*;
+$INcm = $IN $CM*;
+$IScm = $IS $CM*;
+$JLcm = $JL $CM*;
+$JVcm = $JV $CM*;
+$JTcm = $JT $CM*;
+$NScm = $NS $CM*;
+$NUcm = $NU $CM*;
+$OPcm = $OP $CM*;
+$POcm = $PO $CM*;
+$PRcm = $PR $CM*;
+$QUcm = $QU $CM*;
+$RIcm = $RI $CM*;
+$SYcm = $SY $CM*;
+$WJcm = $WJ $CM*;
+
+## -------------------------------------------------
+
+!!forward;
+
+#
+#  Each class of character can stand by itself as an unbroken token, with trailing combining stuff
+#
+$ALPlus $CM+;
+$BA $CM+;
+$BB $CM+;
+$B2 $CM+;
+$CL $CM+;
+$CP $CM+;
+$EX $CM+;
+$GL $CM+;
+$HL $CM+;
+$HY $CM+;
+$H2 $CM+;
+$H3 $CM+;
+$ID $CM+;
+$IN $CM+;
+$IS $CM+;
+$JL $CM+;
+$JV $CM+;
+$JT $CM+;
+$NS $CM+;
+$NU $CM+;
+$OP $CM+;
+$PO $CM+;
+$PR $CM+;
+$QU $CM+;
+$RI $CM+;
+$SY $CM+;
+$WJ $CM+;
+
+#
+# CAN_CM  is the set of characters that may combine with CM combining chars.
+#         Note that Linebreak UAX 14's concept of a combining char and the rules
+#         for what they can combine with are _very_ different from the rest of Unicode.
+#
+#         Note that $CM itself is left out of this set.  If CM is needed as a base
+#         it must be listed separately in the rule.
+#
+$CAN_CM  = [^$SP $BK $CR $LF $NL $ZW $CM];       # Bases that can   take CMs
+$CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM];       # Bases that can't take CMs
+
+#
+# AL_FOLLOW  set of chars that can unconditionally follow an AL
+#            Needed in rules where stand-alone $CM s are treated as AL.
+#            Chaining is disabled with CM because it causes other failures,
+#            so for this one case we need to manually list out longer sequences.
+#
+$AL_FOLLOW_NOCM = [$BK $CR $LF $NL $ZW $SP];
+$AL_FOLLOW_CM   = [$CL $CP $EX $HL $IS $SY $WJ $GL $OP $QU $BA $HY $NS $IN $NU $ALPlus];
+$AL_FOLLOW      = [$AL_FOLLOW_NOCM $AL_FOLLOW_CM];
+
+
+#
+#  Rule LB 4, 5    Mandatory (Hard) breaks.
+#
+$LB4Breaks    = [$BK $CR $LF $NL];
+$LB4NonBreaks = [^$BK $CR $LF $NL];
+$CR $LF {100};
+
+#
+#  LB 6    Do not break before hard line breaks.
+#
+$LB4NonBreaks?  $LB4Breaks {100};    # LB 6  do not break before hard breaks (the base is optional, so a lone hard break also matches).
+$CAN_CM $CM*    $LB4Breaks {100};
+$CM+            $LB4Breaks {100};
+
+# LB 7         x SP
+#              x ZW
+$LB4NonBreaks [$SP $ZW];
+$CAN_CM $CM*  [$SP $ZW];
+$CM+          [$SP $ZW];
+
+#
+# LB 8         Break after zero width space
+#              TODO:  ZW SP* <break>
+#              An engine change is required to write the reverse rule for this.
+#              For now, leave the Unicode 5.2 rule, ZW <break>
+#
+$LB8Breaks    = [$LB4Breaks $ZW];
+$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
+
+
+# Special forward rule for people & body part emoji:
+# don't break $ZWJ from subsequent $EmojiForSeqs
+$EmojiForSeqs $EmojiVar? $EmojiMods? $ZWJ $EmojiForSeqs;
+
+# LB 9     Combining marks.      X   $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL 
+#                                $CM not covered by the above needs to behave like $AL   
+#                                See definition of $CAN_CM.
+
+$CAN_CM $CM+;                   #  Stick together any combining sequences that don't match other rules.
+$CM+;
+
+#
+# LB 11  Do not break before or after WORD JOINER & related characters.
+#
+$CAN_CM $CM*  $WJcm;
+$LB8NonBreaks $WJcm;
+$CM+          $WJcm;
+
+$WJcm $CANT_CM;
+$WJcm $CAN_CM $CM*;
+
+#
+# LB 12  Do not break after NBSP and related characters.
+#         GL  x
+#
+$GLcm $CAN_CM $CM*;
+$GLcm $CANT_CM;
+ 
+#
+# LB 12a  Do not break before NBSP and related characters ...
+#            [^SP BA HY] x GL
+#
+[[$LB8NonBreaks] - [$SP $BA $HY]] $CM* $GLcm;
+$CM+ $GLcm;                     # by rule 10, stand-alone CM behaves as AL  (was "GLcm": literal text, not the variable)
+
+
+
+#
+# LB 13   Don't break before ']' or '!' or ';' or '/', even after spaces.
+#
+$LB8NonBreaks $CL;
+$CAN_CM $CM*  $CL;
+$CM+          $CL;              # by rule 10, stand-alone CM behaves as AL
+
+$LB8NonBreaks $CP;
+$CAN_CM $CM*  $CP;
+$CM+          $CP;              # by rule 10, stand-alone CM behaves as AL
+
+$LB8NonBreaks $EX;
+$CAN_CM $CM*  $EX;
+$CM+          $EX;              # by rule 10, stand-alone CM behaves as AL
+
+$LB8NonBreaks $IS;
+$CAN_CM $CM*  $IS;
+$CM+          $IS;              # by rule 10, stand-alone CM behaves as AL
+
+$LB8NonBreaks $SY;
+$CAN_CM $CM*  $SY;
+$CM+          $SY;              # by rule 10, stand-alone CM behaves as AL
+
+
+#
+# LB 14  Do not break after OP, even after spaces
+#
+$OPcm $SP* $CAN_CM $CM*;
+$OPcm $SP* $CANT_CM;
+
+$OPcm $SP+ $CM+ $AL_FOLLOW?;    # by rule 10, stand-alone CM behaves as AL
+
+# LB 15
+$QUcm $SP* $OPcm;
+
+# LB 16
+($CLcm | $CPcm) $SP* $NScm;
+
+# LB 17
+$B2cm $SP* $B2cm;
+
+#
+# LB 18  Break after spaces.
+#
+$LB18NonBreaks = [$LB8NonBreaks - [$SP]];
+$LB18Breaks    = [$LB8Breaks $SP];
+
+
+# LB 19
+#         x QU
+$LB18NonBreaks $CM* $QUcm;
+$CM+                $QUcm;
+
+#         QU  x
+$QUcm .?;
+$QUcm $LB18NonBreaks $CM*;    # Don't let a combining mark go onto $CR, $BK, etc.
+                              #  TODO:  I don't think this rule is needed.
+
+
+# LB 20
+#        <break>  $CB
+#        $CB   <break>
+
+$LB20NonBreaks = [$LB18NonBreaks - $CB];
+
+# LB 21        x   (BA | HY | NS)
+#           BB x
+#
+$LB20NonBreaks $CM* ($BAcm | $HYcm | $NScm); 
+
+$BBcm [^$CB];                                  #  $BB  x
+$BBcm $LB20NonBreaks $CM*;
+
+# LB 21a Don't break after Hebrew + Hyphen
+#   HL (HY | BA) x
+#  
+$HLcm ($HYcm | $BAcm) [^$CB]?;
+
+# LB 21b (forward) Don't break between SY and HL
+# (break between HL and SY already disallowed by LB 13 above)
+$SYcm $HLcm;
+
+# LB 22
+($ALcm | $HLcm) $INcm;
+$CM+     $INcm;     #  by rule 10, any otherwise unattached CM behaves as AL
+$IDcm    $INcm;
+$INcm    $INcm;
+$NUcm    $INcm;
+
+
+# LB 23
+$IDcm  $POcm;
+$ALcm  $NUcm;       # includes $LB19
+$HLcm  $NUcm;
+$CM+   $NUcm;       # Rule 10, any otherwise unattached CM behaves as AL
+$NUcm  $ALcm;
+$NUcm  $HLcm;
+
+#
+# LB 24
+#
+$PRcm $IDcm;
+$PRcm ($ALcm | $HLcm);
+$POcm ($ALcm | $HLcm);
+
+#
+# LB 25   Numbers.
+#
+($PRcm | $POcm)? ($OPcm | $HYcm)? $NUcm ($NUcm | $SYcm | $IScm)* ($CLcm | $CPcm)? ($PRcm | $POcm)?;
+
+# LB 26  Do not break a Korean syllable
+#
+$JLcm ($JLcm | $JVcm | $H2cm | $H3cm);
+($JVcm | $H2cm) ($JVcm | $JTcm);
+($JTcm | $H3cm) $JTcm;
+
+# LB 27  Treat korean Syllable Block the same as ID  (don't break it)
+($JLcm | $JVcm | $JTcm | $H2cm | $H3cm) $INcm;
+($JLcm | $JVcm | $JTcm | $H2cm | $H3cm) $POcm;
+$PRcm ($JLcm | $JVcm | $JTcm | $H2cm | $H3cm);
+
+
+# LB 28   Do not break between alphabetics
+#
+($ALcm | $HLcm) ($ALcm | $HLcm);
+$CM+ ($ALcm | $HLcm);      # The $CM+ is from rule 10, an unattached CM is treated as AL
+
+# LB 29
+$IScm ($ALcm | $HLcm);
+
+# LB 30
+($ALcm | $HLcm | $NUcm) $OPcm;
+$CM+ $OPcm;         # The $CM+ is from rule 10, an unattached CM is treated as AL.          
+$CPcm ($ALcm | $HLcm | $NUcm);
+
+# LB 30a  Do not break between regional indicators.
+$RIcm $RIcm;
+
+# Special forward rule for people & body part emoji:
+# don't break between relevant emoji and $EmojiMods
+$EmojiForMods $EmojiVar? $EmojiMods;
+
+#
+#  Reverse Rules.
+#
+## -------------------------------------------------
+
+!!reverse;
+
+$CM+ $ALPlus;
+$CM+ $BA;
+$CM+ $BB;
+$CM+ $B2;
+$CM+ $CL;
+$CM+ $CP;
+$CM+ $EX;
+$CM+ $GL;
+$CM+ $HL;
+$CM+ $HY;
+$CM+ $H2;
+$CM+ $H3;
+$CM+ $ID;
+$CM+ $IN;
+$CM+ $IS;
+$CM+ $JL;
+$CM+ $JV;
+$CM+ $JT;
+$CM+ $NS;
+$CM+ $NU;
+$CM+ $OP;
+$CM+ $PO;
+$CM+ $PR;
+$CM+ $QU;
+$CM+ $RI;
+$CM+ $SY;
+$CM+ $WJ;
+$CM+;
+
+
+#
+#  Sequences of the form  (shown forwards)
+#      [CANT_CM]  <break>  [CM]  [whatever]
+#  The CM needs to behave as an AL
+#
+$AL_FOLLOW $CM+ / (
+          [$BK $CR $LF $NL $ZW {eof}] |
+          $SP+ $CM+ $SP |
+          $SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}]));   # if LB 14 will match, need to suppress this break.
+                                               #  LB14 says    OP SP* x .
+                                               #    becomes    OP SP* x AL
+                                               #    becomes    OP SP* x CM+ AL_FOLLOW
+                                               #
+                                               # Further note:  the $AL in [$AL {eof}] is only to work around
+                                               #                a rule compiler bug which complains about
+                                               #                empty sets otherwise.
+          
+#
+#  Sequences of the form  (shown forwards)
+#      [CANT_CM]  <break> [CM]  <break>  [PR]
+#  The CM needs to behave as an AL
+#  This rule is concerned about getting the second of the two <breaks> in place.
+#
+
+[$PR   ] / $CM+ [$BK $CR $LF $NL $ZW $SP {eof}];
+
+
+
+# LB 4, 5, 6
+
+$LB4Breaks [$LB4NonBreaks-$CM];
+$LB4Breaks $CM+ $CAN_CM;
+$LF $CR;
+
+
+# LB 7         x SP
+#              x ZW
+[$SP $ZW] [$LB4NonBreaks-$CM];
+[$SP $ZW] $CM+ $CAN_CM;
+
+# LB 8 ZW SP* <break>
+#     TODO: to implement this, we need more than one look-ahead hard break in play at a time.
+#           Requires an engine enhancement.
+#   / $SP* $ZW
+
+# Special reverse rule for people & body part emoji:
+# don't break $ZWJ from subsequent $EmojiForSeqs
+$EmojiForSeqs $ZWJ $EmojiMods? $EmojiVar? $EmojiForSeqs;
+
+# LB 9,10  Combining marks.
+#    X   $CM needs to behave like X, where X is not $SP or controls.
+#    $CM not covered by the above needs to behave like $AL
+# Stick together any combining sequences that don't match other rules.
+$CM+ $CAN_CM;
+
+
+# LB 11
+$CM* $WJ $CM* $CAN_CM;
+$CM* $WJ      [$LB8NonBreaks-$CM];
+
+     $CANT_CM $CM* $WJ;
+$CM* $CAN_CM  $CM* $WJ;
+
+# LB 12a
+#      [^SP BA HY] x GL
+#
+$CM* $GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HY]];
+
+# LB 12
+#     GL  x
+#
+$CANT_CM $CM* $GL;
+$CM* $CAN_CM $CM* $GL;
+
+
+# LB 13
+$CL $CM+ $CAN_CM;
+$CP $CM+ $CAN_CM;
+$EX $CM+ $CAN_CM;
+$IS $CM+ $CAN_CM;
+$SY $CM+ $CAN_CM;
+
+$CL [$LB8NonBreaks-$CM];
+$CP [$LB8NonBreaks-$CM];
+$EX [$LB8NonBreaks-$CM];
+$IS [$LB8NonBreaks-$CM];
+$SY [$LB8NonBreaks-$CM];
+
+# Rule 13 & 14 taken together for an edge case.
+#   Match this, shown forward
+#     OP SP+  ($CM+ behaving as $AL) (CL | CP | EX | IS | SY)
+#   This really wants to chain at the $CM+ (which is acting as an $AL)
+#   except for $CM chaining being disabled.
+[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP;  
+
+# LB 14    OP SP* x
+#
+$CM* $CAN_CM    $SP* $CM* $OP;
+     $CANT_CM   $SP* $CM* $OP;
+$AL_FOLLOW? $CM+  $SP $SP* $CM* $OP;     #  by LB 10, behaves like $AL_FOLLOW? $AL $SP* $CM* $OP
+     
+     $AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;
+$CM* $AL_FOLLOW_CM   $CM+ $SP+ $CM* $OP;
+$SY $CM $SP+ $OP;   # TODO:  Experiment.  Remove.
+
+
+
+# LB 15
+$CM* $OP $SP* $CM* $QU;
+
+# LB 16
+$CM* $NS $SP* $CM* ($CL | $CP);
+
+# LB 17
+$CM* $B2 $SP* $CM* $B2;
+
+# LB 18  break after spaces
+#        Nothing explicit needed here.
+
+
+#
+# LB 19
+#
+$CM* $QU $CM* $CAN_CM;                                #   . x QU
+$CM* $QU      $LB18NonBreaks;
+
+
+$CM* $CAN_CM  $CM* $QU;                               #   QU x .
+     $CANT_CM $CM* $QU;
+     
+#
+#  LB 20  Break before and after CB.
+#         nothing needed here.
+#
+
+# LB 21
+$CM* ($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM];     #  . x (BA | HY | NS)
+
+$CM* [$LB20NonBreaks-$CM] $CM* $BB;                   #  BB x .
+[^$CB] $CM* $BB;                                      # 
+
+# LB21a
+[^$CB] $CM* ($HY | $BA) $CM* $HL;
+
+# LB21b (reverse)
+$CM* $HL $CM* $SY;
+
+# LB 22
+$CM* $IN $CM* ($ALPlus | $HL);
+$CM* $IN $CM* $ID;
+$CM* $IN $CM* $IN;
+$CM* $IN $CM* $NU;
+
+# LB 23
+$CM* $PO $CM* $ID;
+$CM* $NU $CM* ($ALPlus | $HL);
+$CM* ($ALPlus | $HL) $CM* $NU;
+
+# LB 24
+$CM* $ID $CM* $PR;
+$CM* ($ALPlus | $HL) $CM* $PR;
+$CM* ($ALPlus | $HL) $CM* $PO;
+
+
+# LB 25
+($CM* ($PR | $PO))? ($CM* ($CL | $CP))? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP | $HY))? ($CM* ($PR | $PO))?;
+
+# LB 26
+$CM* ($H3 | $H2 | $JV | $JL) $CM* $JL;
+$CM* ($JT | $JV) $CM* ($H2 | $JV);
+$CM* $JT $CM* ($H3 | $JT);
+
+# LB 27
+$CM* $IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
+$CM* $PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
+$CM* ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
+
+# LB 28
+$CM* ($ALPlus | $HL) $CM* ($ALPlus | $HL);
+
+
+# LB 29
+$CM* ($ALPlus | $HL) $CM* $IS;
+
+# LB 30
+$CM* $OP $CM* ($ALPlus | $HL | $NU);
+$CM* ($ALPlus | $HL | $NU) $CM* $CP;
+
+# LB 30a
+$CM* $RI $CM* $RI;
+
+# Special reverse rule for people & body part emoji:
+# don't break between relevant emoji and $EmojiMods
+$EmojiMods $EmojiVar? $EmojiForMods;
+
+## -------------------------------------------------
+
+!!safe_reverse;
+
+# LB 9
+$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
+$CM+ $SP / .;
+
+# LB 14
+$SP+ $CM* $OP;
+
+# LB 15
+$SP+ $CM* $QU;
+
+# LB 16
+$SP+ $CM* ($CL | $CP);
+
+# LB 17
+$SP+ $CM* $B2;
+
+# LB 21
+$CM* ($HY | $BA) $CM* $HL;
+
+# LB 25
+($CM* ($IS | $SY))+ $CM* $NU;
+($CL | $CP) $CM* ($NU | $IS | $SY);
+
+# For dictionary-based break
+$dictionary $dictionary;
+
+## -------------------------------------------------
+
+!!safe_forward;
+
+# Skip forward over all character classes that are involved in
+#   rules containing patterns with possibly more than one char
+#   of context.
+#
+#  It might be slightly more efficient to have specific rules
+#  instead of one generic one, but only if we could
+#  turn off rule chaining.  We don't want to move more
+#  than necessary.
+#
+[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $SP $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $dictionary];
+$dictionary $dictionary;
+
diff --git a/icuSources/data/brkitr/line_normal_cj.txt b/icuSources/data/brkitr/line_normal_cj.txt

new file mode 100644 (file)

index 0000000..9f365b8
--- /dev/null
+++ b/icuSources/data/brkitr/line_normal_cj.txt
@@ -0,0 +1,719 @@
+# Copyright (c) 2002-2015  International Business Machines Corporation and
+# others. All Rights Reserved.
+#
+#  file:  line_normal_cj.txt
+#
+#         Line Breaking Rules
+#         Implement default line breaking as defined by 
+#         Unicode Standard Annex #14 Revision 29 for Unicode 6.2
+#         http://www.unicode.org/reports/tr14/
+#         tailored as noted in 2nd paragraph below..
+#
+#         TODO:  Rule LB 8 remains as it was in Unicode 5.2
+#         This is only because of a limitation of ICU break engine implementation,
+#         not because the older behavior is desirable.
+#
+#         This tailors the line break behavior to correspond to CSS
+#         line-break=normal (BCP47 -u-lb-normal) as defined for Chinese & Japanese.
+#         It sets characters of class CJ to behave like ID.
+#         In addition, it allows breaks:
+#         * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS)
+
+#
+#  Character Classes defined by TR 14.
+#
+
+!!chain;
+!!LBCMNoChain;
+
+
+!!lookAheadHardBreak;
+#
+#  !!lookAheadHardBreak    Described here because it is (as yet) undocumented elsewhere
+#                          and only used for the line break rules.
+#
+#           It is used in the implementation of rule LB 10
+#           which says to treat any combining mark that is not attached to a base
+#           character as if it were of class AL  (alphabetic).
+#
+#           The problem occurs in the reverse rules.
+#
+#           Consider a sequence like, with correct breaks as shown
+#               LF  ID  CM  AL  AL
+#                  ^       ^       ^
+#           Then consider the sequence without the initial ID (ideographic)
+#                 LF  CM  AL  AL
+#                    ^           ^
+#           Our CM, which in the first example was attached to the ideograph,
+#           is now unattached, becomes an alpha, and joins in with the other
+#           alphas.
+#
+#           When iterating forwards, these sequences do not present any problems
+#           When iterating backwards, we need to look ahead when encountering
+#           a CM to see whether it attaches to something further on or not.
+#           (Look-ahead in a reverse rule is looking towards the start)
+#
+#           If the CM is unattached, we need to force a break.
+#
+#           !!lookAheadHardBreak forces the run time state machine to
+#           stop immediately when a look ahead rule ( '/' operator) matches,
+#           and set the match position to that of the look-ahead operator,
+#           no matter what other rules may be in play at the time.
+#
+#           See the "$AL_FOLLOW $CM+ / (...)" rule under !!reverse for an example.
+#
+
+$AI = [:LineBreak =  Ambiguous:];
+$AL = [:LineBreak =  Alphabetic:];
+$BAX = [\u2010 \u2013];
+$BA = [[:LineBreak =  Break_After:] - $BAX];
+$BB = [:LineBreak =  Break_Before:];
+$BK = [:LineBreak =  Mandatory_Break:];
+$B2 = [:LineBreak =  Break_Both:];
+$CB = [:LineBreak =  Contingent_Break:];
+$CJ = [:LineBreak =  Conditional_Japanese_Starter:];
+$CL = [:LineBreak =  Close_Punctuation:];
+$CM = [:LineBreak =  Combining_Mark:];
+$CP = [:LineBreak =  Close_Parenthesis:];
+$CR = [:LineBreak =  Carriage_Return:];
+$EX = [:LineBreak =  Exclamation:];
+$GL = [:LineBreak =  Glue:];
+$HL = [:LineBreak =  Hebrew_Letter:];
+$HY = [:LineBreak =  Hyphen:];
+$H2 = [:LineBreak =  H2:];
+$H3 = [:LineBreak =  H3:];
+$ID = [[:LineBreak =  Ideographic:] $CJ];
+$IN = [:LineBreak =  Inseperable:];
+$IS = [:LineBreak =  Infix_Numeric:];
+$JL = [:LineBreak =  JL:];
+$JV = [:LineBreak =  JV:];
+$JT = [:LineBreak =  JT:];
+$LF = [:LineBreak =  Line_Feed:];
+$NL = [:LineBreak =  Next_Line:];
+$NSX = [\u301C \u30A0];
+$NS = [[:LineBreak =  Nonstarter:] - $NSX];
+$NU = [:LineBreak =  Numeric:];
+$OP = [:LineBreak =  Open_Punctuation:];
+$PO = [:LineBreak =  Postfix_Numeric:];
+$PR = [:LineBreak =  Prefix_Numeric:];
+$QU = [:LineBreak =  Quotation:];
+$RI = [:LineBreak =  Regional_Indicator:];
+$SA = [:LineBreak =  Complex_Context:];
+$SG = [:LineBreak =  Surrogate:];
+$SP = [:LineBreak =  Space:];
+$SY = [:LineBreak =  Break_Symbols:];
+$WJ = [:LineBreak =  Word_Joiner:];
+$XX = [:LineBreak =  Unknown:];
+$ZW = [:LineBreak =  ZWSpace:];
+
+# Special character classes for people & body part emoji:
+# Subsets of $CM:
+$ZWJ = \u200D;
+$EmojiVar = \uFE0F;
+# The following are subsets of $ID
+$EmojiForSeqs = [\u2764 \U0001F466-\U0001F469 \U0001F48B];
+$EmojiForMods = [\u261D \u270A-\u270C \U0001F385 \U0001F3C3-\U0001F3C4 \U0001F3C7 \U0001F3CA \U0001F442-\U0001F443 \U0001F446-\U0001F450 \U0001F466-\U0001F469 \U0001F46E-\U0001F478 \U0001F47C \U0001F481-\U0001F483 \U0001F485-\U0001F487 \U0001F4AA \U0001F596 \U0001F645-\U0001F647 \U0001F64B-\U0001F64F \U0001F6A3 \U0001F6B4-\U0001F6B6 \U0001F6C0];
+$EmojiMods = [\U0001F3FB-\U0001F3FF];
+
+#   Dictionary character set, for triggering language-based break engines. Currently
+#   limited to LineBreak=Complex_Context. Note that this set only works in Unicode
+#   5.0 or later as the definition of Complex_Context was corrected to include all
+#   characters requiring dictionary break.
+
+$dictionary = [:LineBreak = Complex_Context:];
+
+#
+#  Rule LB1.  By default, treat AI  (characters with ambiguous east Asian width),
+#                               SA  (South East Asian: Thai, Lao, Khmer)
+#                               SG  (Unpaired Surrogates)
+#                               XX  (Unknown, unassigned)
+#                         as $AL  (Alphabetic)
+#
+$ALPlus = [$AL $AI $SA $SG $XX];
+
+#
+#  Combining Marks.   X $CM*  behaves as if it were X.  Rule LB 9.
+#
+$ALcm = $ALPlus $CM*;
+$BAcm = $BA $CM*;
+$BAXcm = $BAX $CM*;
+$BBcm = $BB $CM*;
+$B2cm = $B2 $CM*;
+$CLcm = $CL $CM*;
+$CPcm = $CP $CM*;
+$EXcm = $EX $CM*;
+$GLcm = $GL $CM*;
+$HLcm = $HL $CM*;
+$HYcm = $HY $CM*;
+$H2cm = $H2 $CM*;
+$H3cm = $H3 $CM*;
+$IDcm = $ID $CM*;
+$INcm = $IN $CM*;
+$IScm = $IS $CM*;
+$JLcm = $JL $CM*;
+$JVcm = $JV $CM*;
+$JTcm = $JT $CM*;
+$NScm = $NS $CM*;
+$NSXcm = $NSX $CM*;
+$NUcm = $NU $CM*;
+$OPcm = $OP $CM*;
+$POcm = $PO $CM*;
+$PRcm = $PR $CM*;
+$QUcm = $QU $CM*;
+$RIcm = $RI $CM*;
+$SYcm = $SY $CM*;
+$WJcm = $WJ $CM*;
+
+## -------------------------------------------------
+
+!!forward;
+
+#
+#  Each class of character can stand by itself as an unbroken token, with trailing combining stuff
+#
+$ALPlus $CM+;
+$BA $CM+;
+$BAX $CM+;
+$BB $CM+;
+$B2 $CM+;
+$CL $CM+;
+$CP $CM+;
+$EX $CM+;
+$GL $CM+;
+$HL $CM+;
+$HY $CM+;
+$H2 $CM+;
+$H3 $CM+;
+$ID $CM+;
+$IN $CM+;
+$IS $CM+;
+$JL $CM+;
+$JV $CM+;
+$JT $CM+;
+$NS $CM+;
+$NSX $CM+;
+$NU $CM+;
+$OP $CM+;
+$PO $CM+;
+$PR $CM+;
+$QU $CM+;
+$RI $CM+;
+$SY $CM+;
+$WJ $CM+;
+
+#
+# CAN_CM  is the set of characters that may combine with CM combining chars.
+#         Note that Linebreak UAX 14's concept of a combining char and the rules
+#         for what they can combine with are _very_ different from the rest of Unicode.
+#
+#         Note that $CM itself is left out of this set.  If CM is needed as a base
+#         it must be listed separately in the rule.
+#
+$CAN_CM  = [^$SP $BK $CR $LF $NL $ZW $CM];       # Bases that can   take CMs
+$CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM];       # Bases that can't take CMs
+
+#
+# AL_FOLLOW  set of chars that can unconditionally follow an AL
+#            Needed in rules where stand-alone $CM s are treated as AL.
+#            Chaining is disabled with CM because it causes other failures,
+#            so for this one case we need to manually list out longer sequences.
+#
+$AL_FOLLOW_NOCM = [$BK $CR $LF $NL $ZW $SP];
+$AL_FOLLOW_CM   = [$CL $CP $EX $HL $IS $SY $WJ $GL $OP $QU $BA $BAX $HY $NS $NSX $IN $NU $ALPlus];
+$AL_FOLLOW      = [$AL_FOLLOW_NOCM $AL_FOLLOW_CM];
+
+
+#
+#  Rule LB 4, 5    Mandatory (Hard) breaks.
+#
+$LB4Breaks    = [$BK $CR $LF $NL];
+$LB4NonBreaks = [^$BK $CR $LF $NL];
+$CR $LF {100};
+
+#
+#  LB 6    Do not break before hard line breaks.
+#
+$LB4NonBreaks?  $LB4Breaks {100};    # LB 6  do not break before hard breaks (the base is optional, so a lone hard break also matches).
+$CAN_CM $CM*    $LB4Breaks {100};
+$CM+            $LB4Breaks {100};
+
+# LB 7         x SP
+#              x ZW
+$LB4NonBreaks [$SP $ZW];
+$CAN_CM $CM*  [$SP $ZW];
+$CM+          [$SP $ZW];
+
+#
+# LB 8         Break after zero width space
+#              TODO:  ZW SP* <break>
+#              An engine change is required to write the reverse rule for this.
+#              For now, leave the Unicode 5.2 rule, ZW <break>
+#
+$LB8Breaks    = [$LB4Breaks $ZW];
+$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
+
+
+# Special forward rule for people & body part emoji:
+# don't break $ZWJ from subsequent $EmojiForSeqs
+$EmojiForSeqs $EmojiVar? $EmojiMods? $ZWJ $EmojiForSeqs;
+
+# LB 9     Combining marks.      X   $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL 
+#                                $CM not covered by the above needs to behave like $AL   
+#                                See definition of $CAN_CM.
+
+$CAN_CM $CM+;                   #  Stick together any combining sequences that don't match other rules.
+$CM+;
+
+#
+# LB 11  Do not break before or after WORD JOINER & related characters.
+#
+$CAN_CM $CM*  $WJcm;
+$LB8NonBreaks $WJcm;
+$CM+          $WJcm;
+
+$WJcm $CANT_CM;
+$WJcm $CAN_CM $CM*;
+
+#
+# LB 12  Do not break after NBSP and related characters.
+#         GL  x
+#
+$GLcm $CAN_CM $CM*;
+$GLcm $CANT_CM;
+ 
+#
+# LB 12a  Do not break before NBSP and related characters ...
+#            [^SP BA HY] x GL
+#
+[[$LB8NonBreaks] - [$SP $BA $BAX $HY]] $CM* $GLcm;
+$CM+ $GLcm;                     # by rule 10, stand-alone CM behaves as AL  (was "GLcm": literal text, not the variable)
+
+
+
+#
+# LB 13   Don't break before ']' or '!' or ';' or '/', even after spaces.
+#
+$LB8NonBreaks $CL;
+$CAN_CM $CM*  $CL;
+$CM+          $CL;              # by rule 10, stand-alone CM behaves as AL
+
+$LB8NonBreaks $CP;
+$CAN_CM $CM*  $CP;
+$CM+          $CP;              # by rule 10, stand-alone CM behaves as AL
+
+$LB8NonBreaks $EX;
+$CAN_CM $CM*  $EX;
+$CM+          $EX;              # by rule 10, stand-alone CM behaves as AL
+
+$LB8NonBreaks $IS;
+$CAN_CM $CM*  $IS;
+$CM+          $IS;              # by rule 10, stand-alone CM behaves as AL
+
+$LB8NonBreaks $SY;
+$CAN_CM $CM*  $SY;
+$CM+          $SY;              # by rule 10, stand-alone CM behaves as AL
+
+
+#
+# LB 14  Do not break after OP, even after spaces
+#
+$OPcm $SP* $CAN_CM $CM*;
+$OPcm $SP* $CANT_CM;
+
+$OPcm $SP+ $CM+ $AL_FOLLOW?;    # by rule 10, stand-alone CM behaves as AL
+
+# LB 15
+$QUcm $SP* $OPcm;
+
+# LB 16
+# Do not break between closing punctuation and $NS, even with intervening spaces
+# But DO allow a break between closing punctuation and $NSX, don't include it here
+($CLcm | $CPcm) $SP* $NScm;
+
+# LB 17
+$B2cm $SP* $B2cm;
+
+#
+# LB 18  Break after spaces.
+#
+$LB18NonBreaks = [$LB8NonBreaks - [$SP]];
+$LB18Breaks    = [$LB8Breaks $SP];
+
+
+# LB 19
+#         x QU
+$LB18NonBreaks $CM* $QUcm;
+$CM+                $QUcm;
+
+#         QU  x
+$QUcm .?;
+$QUcm $LB18NonBreaks $CM*;    # Don't let a combining mark go onto $CR, $BK, etc.
+                              #  TODO:  I don't think this rule is needed.
+
+
+# LB 20
+#        <break>  $CB
+#        $CB   <break>
+
+$LB20NonBreaks = [$LB18NonBreaks - $CB];
+
+# LB 21        x   (BA | HY | NS)
+#           BB x
+#
+# DO allow breaks here before $BAXcm and $NSXcm, so don't include them
+$LB20NonBreaks $CM* ($BAcm | $HYcm | $NScm); 
+
+$BBcm [^$CB];                                  #  $BB  x
+$BBcm $LB20NonBreaks $CM*;
+
+# LB 21a Don't break after Hebrew + Hyphen
+#   HL (HY | BA) x
+#  
+$HLcm ($HYcm | $BAcm | $BAXcm) [^$CB]?;
+
+# LB 21b (forward) Don't break between SY and HL
+# (break between HL and SY already disallowed by LB 13 above)
+$SYcm $HLcm;
+
+# LB 22
+($ALcm | $HLcm) $INcm;
+$CM+     $INcm;     #  by rule 10, any otherwise unattached CM behaves as AL
+$IDcm    $INcm;
+$INcm    $INcm;
+$NUcm    $INcm;
+
+
+# LB 23
+$IDcm  $POcm;
+$ALcm  $NUcm;       # includes $LB19
+$HLcm  $NUcm;
+$CM+   $NUcm;       # Rule 10, any otherwise unattached CM behaves as AL
+$NUcm  $ALcm;
+$NUcm  $HLcm;
+
+#
+# LB 24
+#
+$PRcm $IDcm;
+$PRcm ($ALcm | $HLcm);
+$POcm ($ALcm | $HLcm);
+
+#
+# LB 25   Numbers.
+#
+($PRcm | $POcm)? ($OPcm | $HYcm)? $NUcm ($NUcm | $SYcm | $IScm)* ($CLcm | $CPcm)? ($PRcm | $POcm)?;
+
+# LB 26  Do not break a Korean syllable
+#
+$JLcm ($JLcm | $JVcm | $H2cm | $H3cm);
+($JVcm | $H2cm) ($JVcm | $JTcm);
+($JTcm | $H3cm) $JTcm;
+
+# LB 27  Treat korean Syllable Block the same as ID  (don't break it)
+($JLcm | $JVcm | $JTcm | $H2cm | $H3cm) $INcm;
+($JLcm | $JVcm | $JTcm | $H2cm | $H3cm) $POcm;
+$PRcm ($JLcm | $JVcm | $JTcm | $H2cm | $H3cm);
+
+
+# LB 28   Do not break between alphabetics
+#
+($ALcm | $HLcm) ($ALcm | $HLcm);
+$CM+ ($ALcm | $HLcm);      # The $CM+ is from rule 10, an unattached CM is treated as AL
+
+# LB 29
+$IScm ($ALcm | $HLcm);
+
+# LB 30
+($ALcm | $HLcm | $NUcm) $OPcm;
+$CM+ $OPcm;         # The $CM+ is from rule 10, an unattached CM is treated as AL.          
+$CPcm ($ALcm | $HLcm | $NUcm);
+
+# LB 30a  Do not break between regional indicators.
+$RIcm $RIcm;
+
+# Special forward rule for people & body part emoji:
+# don't break between relevant emoji and $EmojiMods
+$EmojiForMods $EmojiVar? $EmojiMods;
+
+#
+#  Reverse Rules.
+#
+## -------------------------------------------------
+
+!!reverse;
+
+$CM+ $ALPlus;
+$CM+ $BA;
+$CM+ $BAX;
+$CM+ $BB;
+$CM+ $B2;
+$CM+ $CL;
+$CM+ $CP;
+$CM+ $EX;
+$CM+ $GL;
+$CM+ $HL;
+$CM+ $HY;
+$CM+ $H2;
+$CM+ $H3;
+$CM+ $ID;
+$CM+ $IN;
+$CM+ $IS;
+$CM+ $JL;
+$CM+ $JV;
+$CM+ $JT;
+$CM+ $NS;
+$CM+ $NSX;
+$CM+ $NU;
+$CM+ $OP;
+$CM+ $PO;
+$CM+ $PR;
+$CM+ $QU;
+$CM+ $RI;
+$CM+ $SY;
+$CM+ $WJ;
+$CM+;
+
+
+#
+#  Sequences of the form  (shown forwards)
+#      [CANT_CM]  <break>  [CM]  [whatever]
+#  The CM needs to behave as an AL
+#
+$AL_FOLLOW $CM+ / (
+          [$BK $CR $LF $NL $ZW {eof}] |
+          $SP+ $CM+ $SP |
+          $SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}]));   # if LB 14 will match, need to suppress this break.
+                                               #  LB14 says    OP SP* x .        
+                                               #    becomes    OP SP* x AL
+                                               #    becomes    OP SP* x CM+ AL_FOLLOW
+                                               #
+                                               # Further note:  the $AL in [$AL {eof}] is only to work around
+                                               #                a rule compiler bug which complains about
+                                               #                empty sets otherwise.
+          
+#
+#  Sequences of the form  (shown forwards)
+#      [CANT_CM]  <break> [CM]  <break>  [PR]
+#  The CM needs to behave as an AL
+#  This rule is concerned about getting the second of the two <breaks> in place.
+#
+
+[$PR   ] / $CM+ [$BK $CR $LF $NL $ZW $SP {eof}];
+
+
+
+# LB 4, 5, 6
+
+$LB4Breaks [$LB4NonBreaks-$CM];
+$LB4Breaks $CM+ $CAN_CM;
+$LF $CR;
+
+
+# LB 7         x SP
+#              x ZW
+[$SP $ZW] [$LB4NonBreaks-$CM];
+[$SP $ZW] $CM+ $CAN_CM;
+
+# LB 8 ZW SP* <break>
+#     TODO: to implement this, we need more than one look-ahead hard break in play at a time.
+#           Requires an engine enhancement.
+#   / $SP* $ZW
+
+# Special reverse rule for people & body part emoji:
+# don't break $ZWJ from subsequent $EmojiForSeqs
+$EmojiForSeqs $ZWJ $EmojiMods? $EmojiVar? $EmojiForSeqs;
+
+# LB 9,10  Combining marks.
+#    X   $CM needs to behave like X, where X is not $SP or controls.
+#    $CM not covered by the above needs to behave like $AL
+# Stick together any combining sequences that don't match other rules.
+$CM+ $CAN_CM;
+
+
+# LB 11
+$CM* $WJ $CM* $CAN_CM;
+$CM* $WJ      [$LB8NonBreaks-$CM];
+
+     $CANT_CM $CM* $WJ;
+$CM* $CAN_CM  $CM* $WJ;
+
+# LB 12a
+#      [^SP BA HY] x GL
+#
+$CM* $GL $CM* [$LB8NonBreaks-[$CM $SP $BA $BAX $HY]];
+
+# LB 12
+#     GL  x
+#
+$CANT_CM $CM* $GL;
+$CM* $CAN_CM $CM* $GL;
+
+
+# LB 13
+$CL $CM+ $CAN_CM;
+$CP $CM+ $CAN_CM;
+$EX $CM+ $CAN_CM;
+$IS $CM+ $CAN_CM;
+$SY $CM+ $CAN_CM;
+
+$CL [$LB8NonBreaks-$CM];
+$CP [$LB8NonBreaks-$CM];
+$EX [$LB8NonBreaks-$CM];
+$IS [$LB8NonBreaks-$CM];
+$SY [$LB8NonBreaks-$CM];
+
+# Rule 13 & 14 taken together for an edge case.
+#   Match this, shown forward
+#     OP SP+  ($CM+ behaving as $AL) (CL | CP | EX | IS | SY)
+#   This really wants to chain at the $CM+ (which is acting as an $AL)
+#   except for $CM chaining being disabled.
+[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP;  
+
+# LB 14    OP SP* x
+#
+$CM* $CAN_CM    $SP* $CM* $OP;
+     $CANT_CM   $SP* $CM* $OP;
+$AL_FOLLOW? $CM+  $SP $SP* $CM* $OP;     #  by LB 10, behaves like $AL_FOLLOW? $AL $SP* $CM* $OP
+     
+     $AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;
+$CM* $AL_FOLLOW_CM   $CM+ $SP+ $CM* $OP;
+$SY $CM $SP+ $OP;   # TODO:  Experiment.  Remove.
+
+
+
+# LB 15
+$CM* $OP $SP* $CM* $QU;
+
+# LB 16
+# Don't include $NSX here
+$CM* $NS $SP* $CM* ($CL | $CP);
+
+# LB 17
+$CM* $B2 $SP* $CM* $B2;
+
+# LB 18  break after spaces
+#        Nothing explicit needed here.
+
+
+#
+# LB 19
+#
+$CM* $QU $CM* $CAN_CM;                                #   . x QU
+$CM* $QU      $LB18NonBreaks;
+
+
+$CM* $CAN_CM  $CM* $QU;                               #   QU x .
+     $CANT_CM $CM* $QU;
+     
+#
+#  LB 20  Break before and after CB.
+#         nothing needed here.
+#
+
+# LB 21
+# Don't include $BAX or $NSX here
+$CM* ($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM];     #  . x (BA | HY | NS)
+
+$CM* [$LB20NonBreaks-$CM] $CM* $BB;                   #  BB x .
+[^$CB] $CM* $BB;                                      # 
+
+# LB21a
+[^$CB] $CM* ($HY | $BA | $BAX) $CM* $HL;
+
+# LB21b (reverse)
+$CM* $HL $CM* $SY;
+
+# LB 22
+$CM* $IN $CM* ($ALPlus | $HL);
+$CM* $IN $CM* $ID;
+$CM* $IN $CM* $IN;
+$CM* $IN $CM* $NU;
+
+# LB 23
+$CM* $PO $CM* $ID;
+$CM* $NU $CM* ($ALPlus | $HL);
+$CM* ($ALPlus | $HL) $CM* $NU;
+
+# LB 24
+$CM* $ID $CM* $PR;
+$CM* ($ALPlus | $HL) $CM* $PR;
+$CM* ($ALPlus | $HL) $CM* $PO;
+
+
+# LB 25
+($CM* ($PR | $PO))? ($CM* ($CL | $CP))? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP | $HY))? ($CM* ($PR | $PO))?;
+
+# LB 26
+$CM* ($H3 | $H2 | $JV | $JL) $CM* $JL;
+$CM* ($JT | $JV) $CM* ($H2 | $JV);
+$CM* $JT $CM* ($H3 | $JT);
+
+# LB 27
+$CM* $IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
+$CM* $PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
+$CM* ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
+
+# LB 28
+$CM* ($ALPlus | $HL) $CM* ($ALPlus | $HL);
+
+
+# LB 29
+$CM* ($ALPlus | $HL) $CM* $IS;
+
+# LB 30
+$CM* $OP $CM* ($ALPlus | $HL | $NU);
+$CM* ($ALPlus | $HL | $NU) $CM* $CP;
+
+# LB 30a
+$CM* $RI $CM* $RI;
+
+# Special reverse rule for people & body part emoji:
+# don't break between relevant emoji and $EmojiMods
+$EmojiMods $EmojiVar? $EmojiForMods;
+
+## -------------------------------------------------
+
+!!safe_reverse;
+
+# LB 9
+$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
+$CM+ $SP / .;
+
+# LB 14
+$SP+ $CM* $OP;
+
+# LB 15
+$SP+ $CM* $QU;
+
+# LB 16
+$SP+ $CM* ($CL | $CP);
+
+# LB 17
+$SP+ $CM* $B2;
+
+# LB 21
+$CM* ($HY | $BA) $CM* $HL;
+
+# LB 25
+($CM* ($IS | $SY))+ $CM* $NU;
+($CL | $CP) $CM* ($NU | $IS | $SY);
+
+# For dictionary-based break
+$dictionary $dictionary;
+
+## -------------------------------------------------
+
+!!safe_forward;
+
+# Skip forward over all character classes that are involved in
+#   rules containing patterns with possibly more than one char
+#   of context.
+#
+#  It might be slightly more efficient to have specific rules
+#  instead of one generic one, but only if we could
+#  turn off rule chaining.  We don't want to move more
+#  than necessary.
+#
+[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $SP $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $dictionary];
+$dictionary $dictionary;
+
diff --git a/icuSources/data/brkitr/line_normal_fi.txt b/icuSources/data/brkitr/line_normal_fi.txt

new file mode 100644 (file)

index 0000000..f84e8b6
--- /dev/null
+++ b/icuSources/data/brkitr/line_normal_fi.txt
@@ -0,0 +1,715 @@
+# Copyright (c) 2002-2015  International Business Machines Corporation and
+# others. All Rights Reserved.
+#
+#  file:  line_normal_fi.txt
+#
+#         Line Breaking Rules
+#         Implement default line breaking as defined by 
+#         Unicode Standard Annex #14 Revision 29 for Unicode 6.2
+#         http://www.unicode.org/reports/tr14/
+#         tailored as noted in 2nd paragraph below.
+#
+#         TODO:  Rule LB 8 remains as it was in Unicode 5.2
+#         This is only because of a limitation of ICU break engine implementation,
+#         not because the older behavior is desirable.
+#
+#         This tailors the line break behavior both for Finnish and to correspond to CSS
+#         line-break=normal (BCP47 -u-lb-normal) as defined for languages other than 
+#         Chinese & Japanese.
+#         It sets characters of class CJ to behave like ID.
+
+#
+#  Character Classes defined by TR 14.
+#
+
+!!chain;
+!!LBCMNoChain;
+
+
+!!lookAheadHardBreak;
+#
+#  !!lookAheadHardBreak    Described here because it is (as yet) undocumented elsewhere
+#                          and only used for the line break rules.
+#
+#           It is used in the implementation of rule LB 10
+#           which says to treat any combining mark that is not attached to a base
+#           character as if it were of class AL  (alphabetic).
+#
+#           The problem occurs in the reverse rules.
+#
+#           Consider a sequence like, with correct breaks as shown
+#               LF  ID  CM  AL  AL
+#                  ^       ^       ^
+#           Then consider the sequence without the initial ID (ideographic)
+#                 LF  CM  AL  AL
+#                    ^           ^
+#           Our CM, which in the first example was attached to the ideograph,
+#           is now unattached, becomes an alpha, and joins in with the other
+#           alphas.
+#
+#           When iterating forwards, these sequences do not present any problems
+#           When iterating backwards, we need to look ahead when encountering
+#           a CM to see whether it attaches to something further on or not.
+#           (Look-ahead in a reverse rule is looking towards the start)
+#
+#           If the CM is unattached, we need to force a break.
+#
+#           !!lookAheadHardBreak forces the run time state machine to
+#           stop immediately when a look ahead rule ( '/' operator) matches,
+#           and set the match position to that of the look-ahead operator,
+#           no matter what other rules may be in play at the time.
+#
+#           See rule LB 19 for an example.
+#
+
+$AI = [:LineBreak =  Ambiguous:];
+$AL = [:LineBreak =  Alphabetic:];
+$BA = [[:LineBreak =  Break_After:] - [\u2010]];
+$HH = [\u2010];
+$BB = [:LineBreak =  Break_Before:];
+$BK = [:LineBreak =  Mandatory_Break:];
+$B2 = [:LineBreak =  Break_Both:];
+$CB = [:LineBreak =  Contingent_Break:];
+$CJ = [:LineBreak =  Conditional_Japanese_Starter:];
+$CL = [:LineBreak =  Close_Punctuation:];
+$CM = [:LineBreak =  Combining_Mark:];
+$CP = [:LineBreak =  Close_Parenthesis:];
+$CR = [:LineBreak =  Carriage_Return:];
+$EX = [:LineBreak =  Exclamation:];
+$GL = [:LineBreak =  Glue:];
+$HL = [:LineBreak =  Hebrew_Letter:];
+$HY = [:LineBreak =  Hyphen:];
+$H2 = [:LineBreak =  H2:];
+$H3 = [:LineBreak =  H3:];
+$ID = [[:LineBreak =  Ideographic:] $CJ];
+$IN = [:LineBreak =  Inseperable:];
+$IS = [:LineBreak =  Infix_Numeric:];
+$JL = [:LineBreak =  JL:];
+$JV = [:LineBreak =  JV:];
+$JT = [:LineBreak =  JT:];
+$LF = [:LineBreak =  Line_Feed:];
+$NL = [:LineBreak =  Next_Line:];
+$NS = [:LineBreak =  Nonstarter:];
+$NU = [:LineBreak =  Numeric:];
+$OP = [:LineBreak =  Open_Punctuation:];
+$PO = [:LineBreak =  Postfix_Numeric:];
+$PR = [:LineBreak =  Prefix_Numeric:];
+$QU = [:LineBreak =  Quotation:];
+$RI = [:LineBreak =  Regional_Indicator:];
+$SA = [:LineBreak =  Complex_Context:];
+$SG = [:LineBreak =  Surrogate:];
+$SP = [:LineBreak =  Space:];
+$SY = [:LineBreak =  Break_Symbols:];
+$WJ = [:LineBreak =  Word_Joiner:];
+$XX = [:LineBreak =  Unknown:];
+$ZW = [:LineBreak =  ZWSpace:];
+
+# Special character classes for people & body part emoji:
+# Subsets of $CM:
+$ZWJ = \u200D;
+$EmojiVar = \uFE0F;
+# The following are subsets of $ID
+$EmojiForSeqs = [\u2764 \U0001F466-\U0001F469 \U0001F48B];
+$EmojiForMods = [\u261D \u270A-\u270C \U0001F385 \U0001F3C3-\U0001F3C4 \U0001F3C7 \U0001F3CA \U0001F442-\U0001F443 \U0001F446-\U0001F450 \U0001F466-\U0001F469 \U0001F46E-\U0001F478 \U0001F47C \U0001F481-\U0001F483 \U0001F485-\U0001F487 \U0001F4AA \U0001F596 \U0001F645-\U0001F647 \U0001F64B-\U0001F64F \U0001F6A3 \U0001F6B4-\U0001F6B6 \U0001F6C0];
+$EmojiMods = [\U0001F3FB-\U0001F3FF];
+
+#   Dictionary character set, for triggering language-based break engines. Currently
+#   limited to LineBreak=Complex_Context. Note that this set only works in Unicode
+#   5.0 or later as the definition of Complex_Context was corrected to include all
+#   characters requiring dictionary break.
+
+$dictionary = [:LineBreak = Complex_Context:];
+
+#
+#  Rule LB1.  By default, treat AI  (characters with ambiguous east Asian width),
+#                               SA  (South East Asian: Thai, Lao, Khmer)
+#                               SG  (Unpaired Surrogates)
+#                               XX  (Unknown, unassigned)
+#                         as $AL  (Alphabetic)
+#
+$ALPlus = [$AL $AI $SA $SG $XX];
+
+#
+#  Combining Marks.   X $CM*  behaves as if it were X.  Rule LB6.
+#
+$ALcm = $ALPlus $CM*;
+$BAcm = $BA $CM*;
+$HHcm = $HH $CM*;
+$BBcm = $BB $CM*;
+$B2cm = $B2 $CM*;
+$CLcm = $CL $CM*;
+$CPcm = $CP $CM*;
+$EXcm = $EX $CM*;
+$GLcm = $GL $CM*;
+$HLcm = $HL $CM*;
+$HYcm = $HY $CM*;
+$H2cm = $H2 $CM*;
+$H3cm = $H3 $CM*;
+$IDcm = $ID $CM*;
+$INcm = $IN $CM*;
+$IScm = $IS $CM*;
+$JLcm = $JL $CM*;
+$JVcm = $JV $CM*;
+$JTcm = $JT $CM*;
+$NScm = $NS $CM*;
+$NUcm = $NU $CM*;
+$OPcm = $OP $CM*;
+$POcm = $PO $CM*;
+$PRcm = $PR $CM*;
+$QUcm = $QU $CM*;
+$RIcm = $RI $CM*;
+$SYcm = $SY $CM*;
+$WJcm = $WJ $CM*;
+
+## -------------------------------------------------
+
+!!forward;
+
+#
+#  Each class of character can stand by itself as an unbroken token, with trailing combining stuff
+#
+$ALPlus $CM+;
+$BA $CM+;
+$HH $CM+;
+$BB $CM+;
+$B2 $CM+;
+$CL $CM+;
+$CP $CM+;
+$EX $CM+;
+$GL $CM+;
+$HL $CM+;
+$HY $CM+;
+$H2 $CM+;
+$H3 $CM+;
+$ID $CM+;
+$IN $CM+;
+$IS $CM+;
+$JL $CM+;
+$JV $CM+;
+$JT $CM+;
+$NS $CM+;
+$NU $CM+;
+$OP $CM+;
+$PO $CM+;
+$PR $CM+;
+$QU $CM+;
+$RI $CM+;
+$SY $CM+;
+$WJ $CM+;
+
+#
+# CAN_CM  is the set of characters that may combine with CM combining chars.
+#         Note that Linebreak UAX 14's concept of a combining char and the rules
+#         for what they can combine with are _very_ different from the rest of Unicode.
+#
+#         Note that $CM itself is left out of this set.  If CM is needed as a base
+#         it must be listed separately in the rule.
+#
+$CAN_CM  = [^$SP $BK $CR $LF $NL $ZW $CM];       # Bases that can   take CMs
+$CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM];       # Bases that can't take CMs
+
+#
+# AL_FOLLOW  set of chars that can unconditionally follow an AL
+#            Needed in rules where stand-alone $CM s are treated as AL.
+#            Chaining is disabled with CM because it causes other failures,
+#            so for this one case we need to manually list out longer sequences.
+#
+$AL_FOLLOW_NOCM = [$BK $CR $LF $NL $ZW $SP];
+$AL_FOLLOW_CM   = [$CL $CP $EX $HL $IS $SY $WJ $GL $OP $QU $BA $HH $HY $NS $IN $NU $ALPlus];
+$AL_FOLLOW      = [$AL_FOLLOW_NOCM $AL_FOLLOW_CM];
+
+
+#
+#  Rule LB 4, 5    Mandatory (Hard) breaks.
+#
+$LB4Breaks    = [$BK $CR $LF $NL];
+$LB4NonBreaks = [^$BK $CR $LF $NL];
+$CR $LF {100};
+
+#
+#  LB 6    Do not break before hard line breaks.
+#
+$LB4NonBreaks?  $LB4Breaks {100};    # LB 6  do not break before hard breaks.
+$CAN_CM $CM*    $LB4Breaks {100};
+$CM+            $LB4Breaks {100};
+
+# LB 7         x SP
+#              x ZW
+$LB4NonBreaks [$SP $ZW];
+$CAN_CM $CM*  [$SP $ZW];
+$CM+          [$SP $ZW];
+
+#
+# LB 8         Break after zero width space
+#              TODO:  ZW SP* <break>
+#              An engine change is required to write the reverse rule for this.
+#              For now, leave the Unicode 5.2 rule, ZW <break>
+#
+$LB8Breaks    = [$LB4Breaks $ZW];
+$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
+
+
+# Special forward rule for people & body part emoji:
+# don't break $ZWJ from subsequent $EmojiForSeqs
+$EmojiForSeqs $EmojiVar? $EmojiMods? $ZWJ $EmojiForSeqs;
+
+# LB 9     Combining marks.      X   $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL 
+#                                $CM not covered by the above needs to behave like $AL   
+#                                See definition of $CAN_CM.
+
+$CAN_CM $CM+;                   #  Stick together any combining sequences that don't match other rules.
+$CM+;
+
+#
+# LB 11  Do not break before or after WORD JOINER & related characters.
+#
+$CAN_CM $CM*  $WJcm;
+$LB8NonBreaks $WJcm;
+$CM+          $WJcm;
+
+$WJcm $CANT_CM;
+$WJcm $CAN_CM $CM*;
+
+#
+# LB 12  Do not break after NBSP and related characters.
+#         GL  x
+#
+$GLcm $CAN_CM $CM*;
+$GLcm $CANT_CM;
+ 
+#
+# LB 12a  Do not break before NBSP and related characters ...
+#            [^SP BA HY] x GL
+#
+[[$LB8NonBreaks] - [$SP $BA $HH $HY]] $CM* $GLcm;
+$CM+ $GLcm;                     # by rule 10, stand-alone CM behaves as AL (was literal "GLcm": missing '$')
+
+
+
+#
+# LB 13   Don't break before ']' or '!' or ';' or '/', even after spaces.
+#
+$LB8NonBreaks $CL;
+$CAN_CM $CM*  $CL;
+$CM+          $CL;              # by rule 10, stand-alone CM behaves as AL
+
+$LB8NonBreaks $CP;
+$CAN_CM $CM*  $CP;
+$CM+          $CP;              # by rule 10, stand-alone CM behaves as AL
+
+$LB8NonBreaks $EX;
+$CAN_CM $CM*  $EX;
+$CM+          $EX;              # by rule 10, stand-alone CM behaves as AL
+
+$LB8NonBreaks $IS;
+$CAN_CM $CM*  $IS;
+$CM+          $IS;              # by rule 10, stand-alone CM behaves as AL
+
+$LB8NonBreaks $SY;
+$CAN_CM $CM*  $SY;
+$CM+          $SY;              # by rule 10, stand-alone CM behaves as AL
+
+
+#
+# LB 14  Do not break after OP, even after spaces
+#
+$OPcm $SP* $CAN_CM $CM*;
+$OPcm $SP* $CANT_CM;
+
+$OPcm $SP+ $CM+ $AL_FOLLOW?;    # by rule 10, stand-alone CM behaves as AL
+
+# LB 15
+$QUcm $SP* $OPcm;
+
+# LB 16
+($CLcm | $CPcm) $SP* $NScm;
+
+# LB 17
+$B2cm $SP* $B2cm;
+
+#
+# LB 18  Break after spaces.
+#
+$LB18NonBreaks = [$LB8NonBreaks - [$SP]];
+$LB18Breaks    = [$LB8Breaks $SP];
+
+
+# LB 19
+#         x QU
+$LB18NonBreaks $CM* $QUcm;
+$CM+                $QUcm;
+
+#         QU  x
+$QUcm .?;
+$QUcm $LB18NonBreaks $CM*;    # Don't let a combining mark go onto $CR, $BK, etc.
+                              #  TODO:  I don't think this rule is needed.
+
+
+# LB 20
+#        <break>  $CB
+#        $CB   <break>
+
+$LB20NonBreaks = [$LB18NonBreaks - $CB];
+
+# LB 20.09 added rule for Finnish tailoring
+# LB 21        x   (BA | HY | NS)
+#           BB x
+#
+$LB20NonBreaks $CM* ($BAcm | $HHcm | $HYcm | $NScm) / $AL; 
+$LB20NonBreaks $CM* ($BAcm | $HHcm | $HYcm | $NScm); 
+($HY | $HH) $AL;
+
+$BBcm [^$CB];                                  #  $BB  x
+$BBcm $LB20NonBreaks $CM*;
+
+# LB 21a Don't break after Hebrew + Hyphen
+#   HL (HY | BA) x
+#  
+$HLcm ($HYcm | $BAcm | $HHcm) [^$CB]?;
+
+# LB 21b (forward) Don't break between SY and HL
+# (break between HL and SY already disallowed by LB 13 above)
+$SYcm $HLcm;
+
+# LB 22
+($ALcm | $HLcm) $INcm;
+$CM+     $INcm;     #  by rule 10, any otherwise unattached CM behaves as AL
+$IDcm    $INcm;
+$INcm    $INcm;
+$NUcm    $INcm;
+
+
+# LB 23
+$IDcm  $POcm;
+$ALcm  $NUcm;       # includes $LB19
+$HLcm  $NUcm;
+$CM+   $NUcm;       # Rule 10, any otherwise unattached CM behaves as AL
+$NUcm  $ALcm;
+$NUcm  $HLcm;
+
+#
+# LB 24
+#
+$PRcm $IDcm;
+$PRcm ($ALcm | $HLcm);
+$POcm ($ALcm | $HLcm);
+
+#
+# LB 25   Numbers.
+#
+($PRcm | $POcm)? ($OPcm | $HYcm)? $NUcm ($NUcm | $SYcm | $IScm)* ($CLcm | $CPcm)? ($PRcm | $POcm)?;
+
+# LB 26  Do not break a Korean syllable
+#
+$JLcm ($JLcm | $JVcm | $H2cm | $H3cm);
+($JVcm | $H2cm) ($JVcm | $JTcm);
+($JTcm | $H3cm) $JTcm;
+
+# LB 27  Treat Korean Syllable Block the same as ID  (don't break it)
+($JLcm | $JVcm | $JTcm | $H2cm | $H3cm) $INcm;
+($JLcm | $JVcm | $JTcm | $H2cm | $H3cm) $POcm;
+$PRcm ($JLcm | $JVcm | $JTcm | $H2cm | $H3cm);
+
+
+# LB 28   Do not break between alphabetics
+#
+($ALcm | $HLcm) ($ALcm | $HLcm);
+$CM+ ($ALcm | $HLcm);      # The $CM+ is from rule 10, an unattached CM is treated as AL
+
+# LB 29
+$IScm ($ALcm | $HLcm);
+
+# LB 30
+($ALcm | $HLcm | $NUcm) $OPcm;
+$CM+ $OPcm;         # The $CM+ is from rule 10, an unattached CM is treated as AL.          
+$CPcm ($ALcm | $HLcm | $NUcm);
+
+# LB 30a  Do not break between regional indicators.
+$RIcm $RIcm;
+
+# Special forward rule for people & body part emoji:
+# don't break between relevant emoji and $EmojiMods
+$EmojiForMods $EmojiVar? $EmojiMods;
+
+#
+#  Reverse Rules.
+#
+## -------------------------------------------------
+
+!!reverse;
+
+$CM+ $ALPlus;
+$CM+ $BA;
+$CM+ $HH;
+$CM+ $BB;
+$CM+ $B2;
+$CM+ $CL;
+$CM+ $CP;
+$CM+ $EX;
+$CM+ $GL;
+$CM+ $HL;
+$CM+ $HY;
+$CM+ $H2;
+$CM+ $H3;
+$CM+ $ID;
+$CM+ $IN;
+$CM+ $IS;
+$CM+ $JL;
+$CM+ $JV;
+$CM+ $JT;
+$CM+ $NS;
+$CM+ $NU;
+$CM+ $OP;
+$CM+ $PO;
+$CM+ $PR;
+$CM+ $QU;
+$CM+ $RI;
+$CM+ $SY;
+$CM+ $WJ;
+$CM+;
+
+
+#
+#  Sequences of the form  (shown forwards)
+#      [CANT_CM]  <break>  [CM]  [whatever]
+#  The CM needs to behave as an AL
+#
+$AL_FOLLOW $CM+ / (
+          [$BK $CR $LF $NL $ZW {eof}] |
+          $SP+ $CM+ $SP |
+          $SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}]));   # if LB 14 will match, need to suppress this break.
+                                               #  LB14 says    OP SP* x .        
+                                               #    becomes    OP SP* x AL
+                                               #    becomes    OP SP* x CM+ AL_FOLLOW
+                                               #
+                                               # Further note:  the $AL in [$AL {eof}] is only to work around
+                                               #                a rule compiler bug which complains about
+                                               #                empty sets otherwise.
+          
+#
+#  Sequences of the form  (shown forwards)
+#      [CANT_CM]  <break> [CM]  <break>  [PR]
+#  The CM needs to behave as an AL
+#  This rule is concerned about getting the second of the two <breaks> in place.
+#
+
+[$PR   ] / $CM+ [$BK $CR $LF $NL $ZW $SP {eof}];
+
+
+
+# LB 4, 5, 6
+
+$LB4Breaks [$LB4NonBreaks-$CM];
+$LB4Breaks $CM+ $CAN_CM;
+$LF $CR;
+
+
+# LB 7         x SP
+#              x ZW
+[$SP $ZW] [$LB4NonBreaks-$CM];
+[$SP $ZW] $CM+ $CAN_CM;
+
+# LB 8 ZW SP* <break>
+#     TODO: to implement this, we need more than one look-ahead hard break in play at a time.
+#           Requires an engine enhancement.
+#   / $SP* $ZW
+
+# Special reverse rule for people & body part emoji:
+# don't break $ZWJ from subsequent $EmojiForSeqs
+$EmojiForSeqs $ZWJ $EmojiMods? $EmojiVar? $EmojiForSeqs;
+
+# LB 9,10  Combining marks.
+#    X   $CM needs to behave like X, where X is not $SP or controls.
+#    $CM not covered by the above needs to behave like $AL
+# Stick together any combining sequences that don't match other rules.
+$CM+ $CAN_CM;
+
+
+# LB 11
+$CM* $WJ $CM* $CAN_CM;
+$CM* $WJ      [$LB8NonBreaks-$CM];
+
+     $CANT_CM $CM* $WJ;
+$CM* $CAN_CM  $CM* $WJ;
+
+# LB 12a
+#      [^SP BA HY] x GL
+#
+$CM* $GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HH $HY]];
+
+# LB 12
+#     GL  x
+#
+$CANT_CM $CM* $GL;
+$CM* $CAN_CM $CM* $GL;
+
+
+# LB 13
+$CL $CM+ $CAN_CM;
+$CP $CM+ $CAN_CM;
+$EX $CM+ $CAN_CM;
+$IS $CM+ $CAN_CM;
+$SY $CM+ $CAN_CM;
+
+$CL [$LB8NonBreaks-$CM];
+$CP [$LB8NonBreaks-$CM];
+$EX [$LB8NonBreaks-$CM];
+$IS [$LB8NonBreaks-$CM];
+$SY [$LB8NonBreaks-$CM];
+
+# Rule 13 & 14 taken together for an edge case.
+#   Match this, shown forward
+#     OP SP+  ($CM+ behaving as $AL) (CL | CP | EX | IS | SY)
+#   This really wants to chain at the $CM+ (which is acting as an $AL)
+#   except for $CM chaining being disabled.
+[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP;  
+
+# LB 14    OP SP* x
+#
+$CM* $CAN_CM    $SP* $CM* $OP;
+     $CANT_CM   $SP* $CM* $OP;
+$AL_FOLLOW? $CM+  $SP $SP* $CM* $OP;     #  by LB 10, behaves like $AL_FOLLOW? $AL $SP* $CM* $OP
+     
+     $AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;
+$CM* $AL_FOLLOW_CM   $CM+ $SP+ $CM* $OP;
+$SY $CM $SP+ $OP;   # TODO:  Experiment.  Remove.
+
+
+
+# LB 15
+$CM* $OP $SP* $CM* $QU;
+
+# LB 16
+$CM* $NS $SP* $CM* ($CL | $CP);
+
+# LB 17
+$CM* $B2 $SP* $CM* $B2;
+
+# LB 18  break after spaces
+#        Nothing explicit needed here.
+
+
+#
+# LB 19
+#
+$CM* $QU $CM* $CAN_CM;                                #   . x QU
+$CM* $QU      $LB18NonBreaks;
+
+
+$CM* $CAN_CM  $CM* $QU;                               #   QU x .
+     $CANT_CM $CM* $QU;
+     
+#
+#  LB 20  Break before and after CB.
+#         nothing needed here.
+#
+
+# LB 20.09 added rule for Finnish tailoring
+$AL ($HY | $HH) / $SP;
+
+# LB 21
+$CM* ($BA | $HH | $HY | $NS) $CM* [$LB20NonBreaks-$CM];     #  . x (BA | HY | NS)
+
+$CM* [$LB20NonBreaks-$CM] $CM* $BB;                   #  BB x .
+[^$CB] $CM* $BB;                                      # 
+
+# LB21a
+[^$CB] $CM* ($HY | $BA | $HH) $CM* $HL;
+
+# LB21b (reverse)
+$CM* $HL $CM* $SY;
+
+# LB 22
+$CM* $IN $CM* ($ALPlus | $HL);
+$CM* $IN $CM* $ID;
+$CM* $IN $CM* $IN;
+$CM* $IN $CM* $NU;
+
+# LB 23
+$CM* $PO $CM* $ID;
+$CM* $NU $CM* ($ALPlus | $HL);
+$CM* ($ALPlus | $HL) $CM* $NU;
+
+# LB 24
+$CM* $ID $CM* $PR;
+$CM* ($ALPlus | $HL) $CM* $PR;
+$CM* ($ALPlus | $HL) $CM* $PO;
+
+
+# LB 25
+($CM* ($PR | $PO))? ($CM* ($CL | $CP))? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP | $HY))? ($CM* ($PR | $PO))?;
+
+# LB 26
+$CM* ($H3 | $H2 | $JV | $JL) $CM* $JL;
+$CM* ($JT | $JV) $CM* ($H2 | $JV);
+$CM* $JT $CM* ($H3 | $JT);
+
+# LB 27
+$CM* $IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
+$CM* $PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
+$CM* ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
+
+# LB 28
+$CM* ($ALPlus | $HL) $CM* ($ALPlus | $HL);
+
+
+# LB 29
+$CM* ($ALPlus | $HL) $CM* $IS;
+
+# LB 30
+$CM* $OP $CM* ($ALPlus | $HL | $NU);
+$CM* ($ALPlus | $HL | $NU) $CM* $CP;
+
+# LB 30a
+$CM* $RI $CM* $RI;
+
+# Special reverse rule for people & body part emoji:
+# don't break between relevant emoji and $EmojiMods
+$EmojiMods $EmojiVar? $EmojiForMods;
+
+## -------------------------------------------------
+
+!!safe_reverse;
+
+# LB 9
+$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
+$CM+ $SP / .;
+
+# LB 14
+$SP+ $CM* $OP;
+
+# LB 15
+$SP+ $CM* $QU;
+
+# LB 16
+$SP+ $CM* ($CL | $CP);
+
+# LB 17
+$SP+ $CM* $B2;
+
+# LB 21
+$CM* ($HY | $BA | $HH) $CM* $HL;
+
+# LB 25
+($CM* ($IS | $SY))+ $CM* $NU;
+($CL | $CP) $CM* ($NU | $IS | $SY);
+
+# For dictionary-based break
+$dictionary $dictionary;
+
+## -------------------------------------------------
+
+!!safe_forward;
+
+# Skip forward over all character classes that are involved in
+#   rules containing patterns with possibly more than one char
+#   of context.
+#
+#  It might be slightly more efficient to have specific rules
+#  instead of one generic one, but only if we could
+#  turn off rule chaining.  We don't want to move more
+#  than necessary.
+#
+[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $SP $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $dictionary];
+$dictionary $dictionary;
+
diff --git a/icuSources/data/brkitr/root.txt b/icuSources/data/brkitr/root.txt

index ce05d5b496b7ce2ed760489ad227fd679b48f2a4..3cfa8014495d26483c64261926f742d8d9c7fba6 100644 (file)
--- a/icuSources/data/brkitr/root.txt
+++ b/icuSources/data/brkitr/root.txt
@@ -11,6 +11,9 @@ root{
      boundaries{
          grapheme:process(dependency){"char.brk"}
          line:process(dependency){"line.brk"}
+        line_loose:process(dependency){"line_loose.brk"}
+        line_normal:process(dependency){"line_normal.brk"}
+        line_strict:process(dependency){"line.brk"}
          sentence:process(dependency){"sent.brk"}
          title:process(dependency){"title.brk"}
          word:process(dependency){"word.brk"}
diff --git a/icuSources/data/brkitr/word.txt b/icuSources/data/brkitr/word.txt

index 06c07df450a5072f5f809b97f618055ec6328f35..8d37f031281131a791761feea325f6bd78536472 100644 (file)
--- a/icuSources/data/brkitr/word.txt
+++ b/icuSources/data/brkitr/word.txt
@@ -1,5 +1,5 @@
  #
-# Copyright (C) 2002-2013, International Business Machines Corporation 
+# Copyright (C) 2002-2015, International Business Machines Corporation 
  # and others. All Rights Reserved.
  #
  # file:  word.txt
@@ -43,48 +43,56 @@ $ExtendNumLet       = [\p{Word_Break = ExtendNumLet}];
  $Han                = [:Han:];
  $Hiragana           = [:Hiragana:];
  
-$RI_A        = \U0001F1E6;  # Trail ERTU
-$RI_B        = \U0001F1E7;  # Trail EGR
-$RI_C        = \U0001F1E8;  # Trail AHLNZ
+$RI_A        = \U0001F1E6;  # Trail ETU
+$RI_B        = \U0001F1E7;  # Trail ER
+$RI_C        = \U0001F1E8;  # Trail AHLNO
  $RI_D        = \U0001F1E9;  # Trail EK
-$RI_E        = \U0001F1EA;  # Trail GS
+$RI_E        = \U0001F1EA;  # Trail S
  $RI_F        = \U0001F1EB;  # Trail IR
-$RI_G        = \U0001F1EC;  # Trail BR
-$RI_H        = \U0001F1ED;  # Trail KU
-$RI_I        = \U0001F1EE;  # Trail DLNT
+$RI_G        = \U0001F1EC;  # Trail B
+$RI_H        = \U0001F1ED;  # Trail K
+$RI_I        = \U0001F1EE;  # Trail DELNT
  $RI_J        = \U0001F1EF;  # Trail OP
  $RI_K        = \U0001F1F0;  # Trail R
-$RI_L        = \U0001F1F1;  # Trail B
  $RI_M        = \U0001F1F2;  # Trail OXY
-$RI_N        = \U0001F1F3;  # Trail LO
-$RI_P        = \U0001F1F5;  # Trail LT
-$RI_R        = \U0001F1F7;  # Trail OU
-$RI_S        = \U0001F1F8;  # Trail AEGK
-$RI_T        = \U0001F1F9;  # Trail HRW
-$RI_U        = \U0001F1FA;  # Trail AS
+$RI_N        = \U0001F1F3;  # Trail LOZ
+$RI_P        = \U0001F1F5;  # Trail HLRT
+$RI_R        = \U0001F1F7;  # Trail U
+$RI_S        = \U0001F1F8;  # Trail AEG
+$RI_T        = \U0001F1F9;  # Trail R
+$RI_U        = \U0001F1FA;  # Trail S
  $RI_V        = \U0001F1FB;     # Trail N
+$RI_Z        = \U0001F1FF;     # Trail A
  
-$RI_A_End    = [\U0001F1EA \U0001F1F7 \U0001F1F9 \U0001F1FA];          # ERTU
-$RI_B_End    = [\U0001F1EA \U0001F1EC \U0001F1F7];              # EGR
-$RI_C_End    = [\U0001F1E6 \U0001F1ED \U0001F1F1 \U0001F1F3 \U0001F1FF]; # AHLNZ
+$RI_A_End    = [\U0001F1EA \U0001F1F9 \U0001F1FA];             # ETU
+$RI_B_End    = [\U0001F1EA \U0001F1F7];                         # ER
+$RI_C_End    = [\U0001F1E6 \U0001F1ED \U0001F1F1 \U0001F1F3 \U0001F1F4]; # AHLNO
  $RI_D_End    = [\U0001F1EA \U0001F1F0];                         # EK
-$RI_E_End    = [\U0001F1EC \U0001F1F8];                         # GS
+$RI_E_End    = \U0001F1F8;                                      # S
  $RI_F_End    = [\U0001F1EE \U0001F1F7];                         # IR
-$RI_G_End    = [\U0001F1E7 \U0001F1F7];                         # BR
-$RI_H_End    = [\U0001F1F0 \U0001F1FA];                         # KU
-$RI_I_End    = [\U0001F1E9 \U0001F1F1 \U0001F1F3 \U0001F1F9];  # DLNT
-$RI_J_End    = [\U0001F1F4 \U0001F1F5];                         # OP
+$RI_G_End    = \U0001F1E7;                                      # B
+$RI_H_End    = \U0001F1F0;                                      # K
+$RI_I_End    = [\U0001F1E9 \U0001F1EA \U0001F1F1 \U0001F1F3 \U0001F1F9]; # DELNT
+$RI_J_End    = [\U0001F1F5 \U0001F1F4];                         # OP
  $RI_K_End    = \U0001F1F7;                                      # R
-$RI_L_End    = \U0001F1E7;                                      # B
  $RI_M_End    = [\U0001F1F4 \U0001F1FD \U0001F1FE];              # OXY
-$RI_N_End    = [\U0001F1F1 \U0001F1F4];                         # LO
-$RI_P_End    = [\U0001F1F1 \U0001F1F9];                         # LT
-$RI_R_End    = [\U0001F1F4 \U0001F1FA];                         # OU
-$RI_S_End    = [\U0001F1E6 \U0001F1EA \U0001F1EC \U0001F1F0];   # AEGK
-$RI_T_End    = [\U0001F1ED \U0001F1F7 \U0001F1FC];              # HRW
-$RI_U_End    = [\U0001F1E6 \U0001F1F8];                         # AS
+$RI_N_End    = [\U0001F1F1 \U0001F1F4 \U0001F1FF];              # LOZ
+$RI_P_End    = [\U0001F1ED \U0001F1F1 \U0001F1F7 \U0001F1F9];   # HLRT
+$RI_R_End    = \U0001F1FA;                                      # U
+$RI_S_End    = [\U0001F1E6 \U0001F1EA \U0001F1EC];              # AEG
+$RI_T_End    = \U0001F1F7;                                      # R
+$RI_U_End    = \U0001F1F8;                                      # S
  $RI_V_End    = \U0001F1F3;                                      # N
+$RI_Z_End    = \U0001F1E6;                                      # A
  
+# Special character classes for people & body part emoji:
+# Subsets of $Extend:
+$ZWJ = \u200D;
+$EmojiVar = \uFE0F;
+# The following are subsets of \p{Word_Break = Other}
+$EmojiForSeqs = [\u2764 \U0001F466-\U0001F469 \U0001F48B];
+$EmojiForMods = [\u261D \u270A-\u270C \U0001F385 \U0001F3C3-\U0001F3C4 \U0001F3C7 \U0001F3CA \U0001F442-\U0001F443 \U0001F446-\U0001F450 \U0001F466-\U0001F469 \U0001F46E-\U0001F478 \U0001F47C \U0001F481-\U0001F483 \U0001F485-\U0001F487 \U0001F4AA \U0001F596 \U0001F645-\U0001F647 \U0001F64B-\U0001F64F \U0001F6A3 \U0001F6B4-\U0001F6B6 \U0001F6C0];
+$EmojiMods = [\U0001F3FB-\U0001F3FF];
  
  #   Dictionary character set, for triggering language-based break engines. Currently
  #   limited to LineBreak=Complex_Context. Note that this set only works in Unicode
@@ -209,7 +217,6 @@ $RI_H ($Extend|$Format)* $RI_H_End ($Extend|$Format)*;
  $RI_I ($Extend|$Format)* $RI_I_End ($Extend|$Format)*;
  $RI_J ($Extend|$Format)* $RI_J_End ($Extend|$Format)*;
  $RI_K ($Extend|$Format)* $RI_K_End ($Extend|$Format)*;
-$RI_L ($Extend|$Format)* $RI_L_End ($Extend|$Format)*;
  $RI_M ($Extend|$Format)* $RI_M_End ($Extend|$Format)*;
  $RI_N ($Extend|$Format)* $RI_N_End ($Extend|$Format)*;
  $RI_P ($Extend|$Format)* $RI_P_End ($Extend|$Format)*;
@@ -218,6 +225,12 @@ $RI_S ($Extend|$Format)* $RI_S_End ($Extend|$Format)*;
  $RI_T ($Extend|$Format)* $RI_T_End ($Extend|$Format)*;
  $RI_U ($Extend|$Format)* $RI_U_End ($Extend|$Format)*;
  $RI_V ($Extend|$Format)* $RI_V_End ($Extend|$Format)*;
+$RI_Z ($Extend|$Format)* $RI_Z_End ($Extend|$Format)*;
+
+# Special forward rules for people & body part emoji:
+# don't break $ZWJ from subsequent $EmojiForSeqs; don't break between relevant emoji and $EmojiMods
+$ZWJ $EmojiForSeqs;
+$EmojiForMods $EmojiVar? $EmojiMods;
  
  # special handling for CJK characters: chain for later dictionary segmentation
  $HangulSyllable $HangulSyllable {200};
@@ -298,7 +311,6 @@ $BackExtendNumLetEx ($BackALetterEx | $BackHebrew_LetterEx | $BackNumericEx | $B
  ($Format|$Extend)* $RI_I_End ($Format|$Extend)* $RI_I;
  ($Format|$Extend)* $RI_J_End ($Format|$Extend)* $RI_J;
  ($Format|$Extend)* $RI_K_End ($Format|$Extend)* $RI_K;
-($Format|$Extend)* $RI_L_End ($Format|$Extend)* $RI_L;
  ($Format|$Extend)* $RI_M_End ($Format|$Extend)* $RI_M;
  ($Format|$Extend)* $RI_N_End ($Format|$Extend)* $RI_N;
  ($Format|$Extend)* $RI_P_End ($Format|$Extend)* $RI_P;
@@ -307,6 +319,12 @@ $BackExtendNumLetEx ($BackALetterEx | $BackHebrew_LetterEx | $BackNumericEx | $B
  ($Format|$Extend)* $RI_T_End ($Format|$Extend)* $RI_T;
  ($Format|$Extend)* $RI_U_End ($Format|$Extend)* $RI_U;
  ($Format|$Extend)* $RI_V_End ($Format|$Extend)* $RI_V;
+($Format|$Extend)* $RI_Z_End ($Format|$Extend)* $RI_Z;
+
+# Special reverse rules for people & body part emoji:
+# don't break $ZWJ from subsequent $EmojiForSeqs; don't break between relevant emoji and $EmojiMods
+$EmojiForSeqs $ZWJ;
+$EmojiMods $EmojiVar? $EmojiForMods;
  
  # special handling for CJK characters: chain for later dictionary segmentation
  $HangulSyllable $HangulSyllable;
diff --git a/icuSources/data/brkitr/word_POSIX.txt b/icuSources/data/brkitr/word_POSIX.txt

index 50ddb812bb3ec85ad61264f66adf610f6cde0f75..fe582b00151fff6a98f653e3ad7d9ba1ab9dd563 100644 (file)
--- a/icuSources/data/brkitr/word_POSIX.txt
+++ b/icuSources/data/brkitr/word_POSIX.txt
@@ -1,5 +1,5 @@
  #
-# Copyright (C) 2002-2013, International Business Machines Corporation 
+# Copyright (C) 2002-2015, International Business Machines Corporation 
  # and others. All Rights Reserved.
  #
  # file:  word_POSIX.txt
@@ -43,48 +43,56 @@ $ExtendNumLet       = [\p{Word_Break = ExtendNumLet}];
  $Han                = [:Han:];
  $Hiragana           = [:Hiragana:];
  
-$RI_A        = \U0001F1E6;  # Trail ERTU
-$RI_B        = \U0001F1E7;  # Trail EGR
-$RI_C        = \U0001F1E8;  # Trail AHLNZ
+$RI_A        = \U0001F1E6;  # Trail ETU
+$RI_B        = \U0001F1E7;  # Trail ER
+$RI_C        = \U0001F1E8;  # Trail AHLNO
  $RI_D        = \U0001F1E9;  # Trail EK
-$RI_E        = \U0001F1EA;  # Trail GS
+$RI_E        = \U0001F1EA;  # Trail S
  $RI_F        = \U0001F1EB;  # Trail IR
-$RI_G        = \U0001F1EC;  # Trail BR
-$RI_H        = \U0001F1ED;  # Trail KU
-$RI_I        = \U0001F1EE;  # Trail DLNT
+$RI_G        = \U0001F1EC;  # Trail B
+$RI_H        = \U0001F1ED;  # Trail K
+$RI_I        = \U0001F1EE;  # Trail DELNT
  $RI_J        = \U0001F1EF;  # Trail OP
  $RI_K        = \U0001F1F0;  # Trail R
-$RI_L        = \U0001F1F1;  # Trail B
  $RI_M        = \U0001F1F2;  # Trail OXY
-$RI_N        = \U0001F1F3;  # Trail LO
-$RI_P        = \U0001F1F5;  # Trail LT
-$RI_R        = \U0001F1F7;  # Trail OU
-$RI_S        = \U0001F1F8;  # Trail AEGK
-$RI_T        = \U0001F1F9;  # Trail HRW
-$RI_U        = \U0001F1FA;  # Trail AS
+$RI_N        = \U0001F1F3;  # Trail LOZ
+$RI_P        = \U0001F1F5;  # Trail HLRT
+$RI_R        = \U0001F1F7;  # Trail U
+$RI_S        = \U0001F1F8;  # Trail AEG
+$RI_T        = \U0001F1F9;  # Trail R
+$RI_U        = \U0001F1FA;  # Trail S
  $RI_V        = \U0001F1FB;     # Trail N
+$RI_Z        = \U0001F1FF;     # Trail A
  
-$RI_A_End    = [\U0001F1EA \U0001F1F7 \U0001F1F9 \U0001F1FA];          # ERTU
-$RI_B_End    = [\U0001F1EA \U0001F1EC \U0001F1F7];              # EGR
-$RI_C_End    = [\U0001F1E6 \U0001F1ED \U0001F1F1 \U0001F1F3 \U0001F1FF]; # AHLNZ
+$RI_A_End    = [\U0001F1EA \U0001F1F9 \U0001F1FA];             # ETU
+$RI_B_End    = [\U0001F1EA \U0001F1F7];                         # ER
+$RI_C_End    = [\U0001F1E6 \U0001F1ED \U0001F1F1 \U0001F1F3 \U0001F1F4]; # AHLNO
  $RI_D_End    = [\U0001F1EA \U0001F1F0];                         # EK
-$RI_E_End    = [\U0001F1EC \U0001F1F8];                         # GS
+$RI_E_End    = \U0001F1F8;                                      # S
  $RI_F_End    = [\U0001F1EE \U0001F1F7];                         # IR
-$RI_G_End    = [\U0001F1E7 \U0001F1F7];                         # BR
-$RI_H_End    = [\U0001F1F0 \U0001F1FA];                         # KU
-$RI_I_End    = [\U0001F1E9 \U0001F1F1 \U0001F1F3 \U0001F1F9];  # DLNT
-$RI_J_End    = [\U0001F1F4 \U0001F1F5];                         # OP
+$RI_G_End    = \U0001F1E7;                                      # B
+$RI_H_End    = \U0001F1F0;                                      # K
+$RI_I_End    = [\U0001F1E9 \U0001F1EA \U0001F1F1 \U0001F1F3 \U0001F1F9]; # DELNT
+$RI_J_End    = [\U0001F1F5 \U0001F1F4];                         # OP
  $RI_K_End    = \U0001F1F7;                                      # R
-$RI_L_End    = \U0001F1E7;                                      # B
  $RI_M_End    = [\U0001F1F4 \U0001F1FD \U0001F1FE];              # OXY
-$RI_N_End    = [\U0001F1F1 \U0001F1F4];                         # LO
-$RI_P_End    = [\U0001F1F1 \U0001F1F9];                         # LT
-$RI_R_End    = [\U0001F1F4 \U0001F1FA];                         # OU
-$RI_S_End    = [\U0001F1E6 \U0001F1EA \U0001F1EC \U0001F1F0];   # AEGK
-$RI_T_End    = [\U0001F1ED \U0001F1F7 \U0001F1FC];              # HRW
-$RI_U_End    = [\U0001F1E6 \U0001F1F8];                         # AS
+$RI_N_End    = [\U0001F1F1 \U0001F1F4 \U0001F1FF];              # LOZ
+$RI_P_End    = [\U0001F1ED \U0001F1F1 \U0001F1F7 \U0001F1F9];   # HLRT
+$RI_R_End    = \U0001F1FA;                                      # U
+$RI_S_End    = [\U0001F1E6 \U0001F1EA \U0001F1EC];              # AEG
+$RI_T_End    = \U0001F1F7;                                      # R
+$RI_U_End    = \U0001F1F8;                                      # S
  $RI_V_End    = \U0001F1F3;                                      # N
+$RI_Z_End    = \U0001F1E6;                                      # A
  
+# Special character classes for people & body part emoji:
+# Subsets of $Extend:
+$ZWJ = \u200D;
+$EmojiVar = \uFE0F;
+# The following are subsets of \p{Word_Break = Other}
+$EmojiForSeqs = [\u2764 \U0001F466-\U0001F469 \U0001F48B];
+$EmojiForMods = [\u261D \u270A-\u270C \U0001F385 \U0001F3C3-\U0001F3C4 \U0001F3C7 \U0001F3CA \U0001F442-\U0001F443 \U0001F446-\U0001F450 \U0001F466-\U0001F469 \U0001F46E-\U0001F478 \U0001F47C \U0001F481-\U0001F483 \U0001F485-\U0001F487 \U0001F4AA \U0001F596 \U0001F645-\U0001F647 \U0001F64B-\U0001F64F \U0001F6A3 \U0001F6B4-\U0001F6B6 \U0001F6C0];
+$EmojiMods = [\U0001F3FB-\U0001F3FF];
  
  #   Dictionary character set, for triggering language-based break engines. Currently
  #   limited to LineBreak=Complex_Context. Note that this set only works in Unicode
@@ -209,7 +217,6 @@ $RI_H ($Extend|$Format)* $RI_H_End ($Extend|$Format)*;
  $RI_I ($Extend|$Format)* $RI_I_End ($Extend|$Format)*;
  $RI_J ($Extend|$Format)* $RI_J_End ($Extend|$Format)*;
  $RI_K ($Extend|$Format)* $RI_K_End ($Extend|$Format)*;
-$RI_L ($Extend|$Format)* $RI_L_End ($Extend|$Format)*;
  $RI_M ($Extend|$Format)* $RI_M_End ($Extend|$Format)*;
  $RI_N ($Extend|$Format)* $RI_N_End ($Extend|$Format)*;
  $RI_P ($Extend|$Format)* $RI_P_End ($Extend|$Format)*;
@@ -218,6 +225,12 @@ $RI_S ($Extend|$Format)* $RI_S_End ($Extend|$Format)*;
  $RI_T ($Extend|$Format)* $RI_T_End ($Extend|$Format)*;
  $RI_U ($Extend|$Format)* $RI_U_End ($Extend|$Format)*;
  $RI_V ($Extend|$Format)* $RI_V_End ($Extend|$Format)*;
+$RI_Z ($Extend|$Format)* $RI_Z_End ($Extend|$Format)*;
+
+# Special forward rules for people & body part emoji:
+# don't break $ZWJ from subsequent $EmojiForSeqs; don't break between relevant emoji and $EmojiMods
+$ZWJ $EmojiForSeqs;
+$EmojiForMods $EmojiVar? $EmojiMods;
  
  # special handling for CJK characters: chain for later dictionary segmentation
  $HangulSyllable $HangulSyllable {200};
@@ -298,7 +311,6 @@ $BackExtendNumLetEx ($BackALetterEx | $BackHebrew_LetterEx | $BackNumericEx | $B
  ($Format|$Extend)* $RI_I_End ($Format|$Extend)* $RI_I;
  ($Format|$Extend)* $RI_J_End ($Format|$Extend)* $RI_J;
  ($Format|$Extend)* $RI_K_End ($Format|$Extend)* $RI_K;
-($Format|$Extend)* $RI_L_End ($Format|$Extend)* $RI_L;
  ($Format|$Extend)* $RI_M_End ($Format|$Extend)* $RI_M;
  ($Format|$Extend)* $RI_N_End ($Format|$Extend)* $RI_N;
  ($Format|$Extend)* $RI_P_End ($Format|$Extend)* $RI_P;
@@ -307,6 +319,12 @@ $BackExtendNumLetEx ($BackALetterEx | $BackHebrew_LetterEx | $BackNumericEx | $B
  ($Format|$Extend)* $RI_T_End ($Format|$Extend)* $RI_T;
  ($Format|$Extend)* $RI_U_End ($Format|$Extend)* $RI_U;
  ($Format|$Extend)* $RI_V_End ($Format|$Extend)* $RI_V;
+($Format|$Extend)* $RI_Z_End ($Format|$Extend)* $RI_Z;
+
+# Special reverse rules for people & body part emoji:
+# don't break $ZWJ from subsequent $EmojiForSeqs; don't break between relevant emoji and $EmojiMods
+$EmojiForSeqs $ZWJ;
+$EmojiMods $EmojiVar? $EmojiForMods;
  
  # special handling for CJK characters: chain for later dictionary segmentation
  $HangulSyllable $HangulSyllable;
diff --git a/icuSources/data/brkitr/zh.txt b/icuSources/data/brkitr/zh.txt

new file mode 100644 (file)

index 0000000..04e9ba3
--- /dev/null
+++ b/icuSources/data/brkitr/zh.txt
@@ -0,0 +1,17 @@
+// ***************************************************************************
+// *
+// * Copyright (C) 2014 International Business Machines
+// * Corporation and others. All Rights Reserved.
+// * Tool: org.unicode.cldr.icu.NewLdml2IcuConverter
+// * Source File: <path>/common/segments/zh.xml ../../xml/brkitr/zh.xml
+// *
+// ***************************************************************************
+zh{
+    Version{"2.0.82.42"}
+    boundaries{
+        line:process(dependency){"line.brk"}
+        line_loose:process(dependency){"line_loose_cj.brk"}
+        line_normal:process(dependency){"line_normal_cj.brk"}
+        line_strict:process(dependency){"line.brk"}
+    }
+}
diff --git a/icuSources/data/brkitr/zh_Hant.txt b/icuSources/data/brkitr/zh_Hant.txt

new file mode 100644 (file)

index 0000000..752278f
--- /dev/null
+++ b/icuSources/data/brkitr/zh_Hant.txt
@@ -0,0 +1,17 @@
+// ***************************************************************************
+// *
+// * Copyright (C) 2014 International Business Machines
+// * Corporation and others. All Rights Reserved.
+// * Tool: org.unicode.cldr.icu.NewLdml2IcuConverter
+// * Source File: <path>/common/segments/zh_Hant.xml ../../xml/brkitr/zh_Hant.xml
+// *
+// ***************************************************************************
+zh_Hant{
+    Version{"2.0.82.42"}
+    boundaries{
+        line:process(dependency){"line.brk"}
+        line_loose:process(dependency){"line_loose_cj.brk"}
+        line_normal:process(dependency){"line_normal_cj.brk"}
+        line_strict:process(dependency){"line.brk"}
+    }
+}
diff --git a/icuSources/data/curr/supplementalData.txt b/icuSources/data/curr/supplementalData.txt

index 7ed00d2e50571407d4c611ad6aa52a6d39da93e2..e36ae4257254387f3ed9051a18b54b9762902496 100644 (file)
--- a/icuSources/data/curr/supplementalData.txt
+++ b/icuSources/data/curr/supplementalData.txt
@@ -1,6 +1,6 @@
  // ***************************************************************************
  // *
-// * Copyright (C) 2014 International Business Machines
+// * Copyright (C) 2015 International Business Machines
  // * Corporation and others. All Rights Reserved.
  // * Tool: org.unicode.cldr.icu.NewLdml2IcuConverter
  // * Source File: <path>/supplementalData.xml
@@ -2459,12 +2459,23 @@ supplementalData:table(nofallback){
              }
          }
          LT{
+            {
+                from:intvector{
+                    330,
+                    -1563774976,
+                }
+                id{"EUR"}
+            }
              {
                  from:intvector{
                      172,
                      -2062942208,
                  }
                  id{"LTL"}
+                to:intvector{
+                    330,
+                    -1563774977,
+                }
              }
              {
                  from:intvector{
diff --git a/icuSources/data/in/ubidi.icu b/icuSources/data/in/ubidi.icu

index 6d898137818e9c586e1db25690255a8ca6ae9c71..9a4888dbe23158f95eb89cce2af9ce3fe45b3c76 100644 (file)

Binary files a/icuSources/data/in/ubidi.icu and b/icuSources/data/in/ubidi.icu differ
diff --git a/icuSources/data/in/unames.icu b/icuSources/data/in/unames.icu

index 2f6fb0cd83e8c7d388f36a7fff9bdbc68d2c587a..a723e493ade81da2eec7176bc2e38eea75794da4 100644 (file)

Binary files a/icuSources/data/in/unames.icu and b/icuSources/data/in/unames.icu differ
diff --git a/icuSources/data/in/uprops.icu b/icuSources/data/in/uprops.icu

index 6dddb506dc9bf6ad49b153b90d3a4a65c890e0d4..f47bd7e5aa7ddd565bd34d2a6dbcad78619cbefc 100644 (file)

Binary files a/icuSources/data/in/uprops.icu and b/icuSources/data/in/uprops.icu differ
diff --git a/icuSources/data/locales/da.txt b/icuSources/data/locales/da.txt

index a82d4fa8f52f4f9b69dd41f3c6ee6fc696c944a2..4ad702216e9d2ed434f04e0b9012aed24624300a 100644 (file)
--- a/icuSources/data/locales/da.txt
+++ b/icuSources/data/locales/da.txt
@@ -1133,8 +1133,8 @@ da{
                  other{"{0} unser"}
              }
              pound{
-                one{"{0} skålpund"}
-                other{"{0} skålpund"}
+                one{"{0} pund"}
+                other{"{0} pund"}
              }
              stone{
                  one{"{0} stone"}
@@ -1348,8 +1348,8 @@ da{
                  other{"{0} unser"}
              }
              pound{
-                one{"{0} pund"}
-                other{"{0} pund"}
+                one{"{0} lb"}
+                other{"{0} lb"}
              }
          }
          power{
@@ -1581,8 +1581,8 @@ da{
                  other{"{0} unser"}
              }
              pound{
-                one{"{0} skålpund"}
-                other{"{0} skålpund"}
+                one{"{0} pund"}
+                other{"{0} pund"}
              }
              stone{
                  one{"{0} st"}
diff --git a/icuSources/data/locales/en.txt b/icuSources/data/locales/en.txt

index ec56cce31dedb21457290b1a15496d7fa88194c4..cfc44f3e5b3b0b8e2015f656ec5a78c90bc114b0 100644 (file)
--- a/icuSources/data/locales/en.txt
+++ b/icuSources/data/locales/en.txt
@@ -1780,7 +1780,7 @@ en{
              }
              minute{
                  one{"{0} min"}
-                other{"{0} mins"}
+                other{"{0} min"}
              }
              month{
                  one{"{0} mth"}
diff --git a/icuSources/data/locales/es.txt b/icuSources/data/locales/es.txt

index c5f58d343a9966a6e47b4b183231c60c0b463463..6316f4b2f17913401ce16e84b2a99aa0b779a46a 100644 (file)
--- a/icuSources/data/locales/es.txt
+++ b/icuSources/data/locales/es.txt
@@ -1043,6 +1043,12 @@ es{
              middle{"{0}, {1}"}
              start{"{0}, {1}"}
          }
+        unit-narrow{
+            2{"{0} {1}"}
+            end{"{0} {1}"}
+            middle{"{0} {1}"}
+            start{"{0} {1}"}
+        }
          unit-short{
              2{"{0} y {1}"}
              end{"{0}, {1}"}
@@ -1352,36 +1358,36 @@ es{
          }
          duration{
              day{
-                one{"{0} d"}
-                other{"{0} d"}
+                one{"{0}d"}
+                other{"{0}d"}
              }
              hour{
-                one{"{0} h"}
-                other{"{0} h"}
+                one{"{0}h"}
+                other{"{0}h"}
              }
              millisecond{
-                one{"{0} ms"}
-                other{"{0} ms"}
+                one{"{0}ms"}
+                other{"{0}ms"}
              }
              minute{
-                one{"{0} min"}
-                other{"{0} min"}
+                one{"{0}min"}
+                other{"{0}min"}
              }
              month{
-                one{"{0} m"}
-                other{"{0} m"}
+                one{"{0}m"}
+                other{"{0}m"}
              }
              second{
-                one{"{0} s"}
-                other{"{0} s"}
+                one{"{0}s"}
+                other{"{0}s"}
              }
              week{
-                one{"{0} semana"}
-                other{"{0} sem"}
+                one{"{0}sem"}
+                other{"{0}sem"}
              }
              year{
                  one{"{0}a"}
-                other{"{0} a"}
+                other{"{0}a"}
              }
          }
          energy{
diff --git a/icuSources/data/locales/es_419.txt b/icuSources/data/locales/es_419.txt

index 7f56fae4145423723649ca8ee93d731ca92fe2d6..3ff34927a01ff7d931d949024b51b366fd4e41a8 100644 (file)
--- a/icuSources/data/locales/es_419.txt
+++ b/icuSources/data/locales/es_419.txt
@@ -98,6 +98,15 @@ es_419{
                          "vie",
                          "sáb",
                      }
+                    narrow{
+                        "D",
+                        "L",
+                        "M",
+                        "M",
+                        "J",
+                        "V",
+                        "S",
+                    }
                      short{
                          "DO",
                          "LU",
diff --git a/icuSources/data/locales/es_MX.txt b/icuSources/data/locales/es_MX.txt

index 0403a508b68f7cbff7c004de27c52cf1b1781b4e..351536f467acecd115ff95273cf32525eac31d03 100644 (file)
--- a/icuSources/data/locales/es_MX.txt
+++ b/icuSources/data/locales/es_MX.txt
@@ -795,10 +795,10 @@ es_MX{
              start{"{0}, {1}"}
          }
          unit-narrow{
-            2{"{0} y {1}"}
-            end{"{0}, {1}"}
-            middle{"{0}, {1}"}
-            start{"{0}, {1}"}
+            2{"{0} {1}"}
+            end{"{0} {1}"}
+            middle{"{0} {1}"}
+            start{"{0} {1}"}
          }
          unit-short{
              2{"{0} y {1}"}
@@ -1043,32 +1043,32 @@ es_MX{
          }
          duration{
              day{
-                one{"{0} d"}
-                other{"{0} d"}
+                one{"{0}d"}
+                other{"{0}d"}
              }
              hour{
-                one{"{0} h"}
-                other{"{0} h"}
+                one{"{0}h"}
+                other{"{0}h"}
              }
              millisecond{
-                one{"{0} ms"}
-                other{"{0} ms"}
+                one{"{0}ms"}
+                other{"{0}ms"}
              }
              minute{
-                one{"{0} min"}
-                other{"{0} min"}
+                one{"{0}min"}
+                other{"{0}min"}
              }
              month{
-                one{"{0} m"}
-                other{"{0} m"}
+                one{"{0}m"}
+                other{"{0}m"}
              }
              week{
-                one{"{0} sem"}
-                other{"{0} sem"}
+                one{"{0}sem"}
+                other{"{0}sem"}
              }
              year{
                  one{"{0}a"}
-                other{"{0} a"}
+                other{"{0}a"}
              }
          }
      }
diff --git a/icuSources/data/locales/fr.txt b/icuSources/data/locales/fr.txt

index e51f356ea84141b8a98eb2a5471ca91557cfdf6b..ed3cb9cc5eb2a7f4901a030e99317881bb627dae 100644 (file)
--- a/icuSources/data/locales/fr.txt
+++ b/icuSources/data/locales/fr.txt
@@ -1943,8 +1943,8 @@ fr{
                  other{"{0} ms"}
              }
              minute{
-                one{"{0} min"}
-                other{"{0} min"}
+                one{"{0} mn"}
+                other{"{0} mn"}
              }
              month{
                  one{"{0} m"}
diff --git a/icuSources/data/locales/hu.txt b/icuSources/data/locales/hu.txt

index 197294786d3467d78134a21e2fa9477b1606787e..7275f27f02c8135b212870e196916308dcf5a3f2 100644 (file)
--- a/icuSources/data/locales/hu.txt
+++ b/icuSources/data/locales/hu.txt
@@ -1085,6 +1085,20 @@ hu{
                      }
                  }
                  stand-alone{
+                    abbreviated{
+                        "Moh.",
+                        "Saf.",
+                        "Rébi I",
+                        "Rébi II",
+                        "Dsem. I",
+                        "Dsem. II",
+                        "Red.",
+                        "Sab.",
+                        "Ram.",
+                        "Sev.",
+                        "Dsül k.",
+                        "Dsül h.",
+                    }
                      narrow{
                          "1",
                          "2",
diff --git a/icuSources/data/locales/it.txt b/icuSources/data/locales/it.txt

index 554a33af4138231cf0cabc074fe7a7dcc330bfe6..0ae03fcd354087c156068c7bfa4ce418860b0684 100644 (file)
--- a/icuSources/data/locales/it.txt
+++ b/icuSources/data/locales/it.txt
@@ -355,10 +355,10 @@ it{
                  Timezone{"{0} {1}"}
              }
              availableFormats{
-                EHm{"E HH.mm"}
+                EHm{"E HH:mm"}
                  EHms{"E HH:mm:ss"}
                  Ed{"E d"}
-                Ehm{"E h.mm a"}
+                Ehm{"E h:mm a"}
                  Ehms{"E h:mm:ss a"}
                  Gy{"y G"}
                  GyMMM{"MMM y G"}
@@ -975,10 +975,10 @@ it{
              start{"{0}, {1}"}
          }
          unit-narrow{
-            2{"{0}, {1}"}
-            end{"{0}, e {1}"}
-            middle{"{0}, {1}"}
-            start{"{0}, {1}"}
+            2{"{0} {1}"}
+            end{"{0} {1}"}
+            middle{"{0} {1}"}
+            start{"{0} {1}"}
          }
          unit-short{
              2{"{0}, {1}"}
@@ -1289,36 +1289,36 @@ it{
          }
          duration{
              day{
-                one{"{0} gg"}
-                other{"{0} gg"}
+                one{"{0}gg"}
+                other{"{0}gg"}
              }
              hour{
-                one{"{0} h"}
-                other{"{0} h"}
+                one{"{0}h"}
+                other{"{0}h"}
              }
              millisecond{
-                one{"{0} ms"}
-                other{"{0} ms"}
+                one{"{0}ms"}
+                other{"{0}ms"}
              }
              minute{
-                one{"{0} m"}
-                other{"{0} m"}
+                one{"{0}min"}
+                other{"{0}min"}
              }
              month{
-                one{"{0} mesi"}
-                other{"{0} mesi"}
+                one{"{0}mesi"}
+                other{"{0}mesi"}
              }
              second{
-                one{"{0} s"}
-                other{"{0} s"}
+                one{"{0}s"}
+                other{"{0}s"}
              }
              week{
-                one{"{0} sett."}
-                other{"{0} sett."}
+                one{"{0}sett."}
+                other{"{0}sett."}
              }
              year{
-                one{"{0} anno"}
-                other{"{0} anni"}
+                one{"{0}anno"}
+                other{"{0}anni"}
              }
          }
          energy{
diff --git a/icuSources/data/locales/it_CH.txt b/icuSources/data/locales/it_CH.txt

index c1bdcebefb1b3b4c7522fb4f42c919fda4eb0622..ea5164b0ebfcbe19244dc47f9d37bdbd890a8afe 100644 (file)
--- a/icuSources/data/locales/it_CH.txt
+++ b/icuSources/data/locales/it_CH.txt
@@ -22,7 +22,7 @@ it_CH{
      calendar{
          generic{
              DateTimePatterns{
-                "HH.mm:ss 'h' zzzz",
+                "HH:mm:ss 'h' zzzz",
                  "HH:mm:ss z",
                  "HH:mm:ss",
                  "HH:mm",
@@ -39,7 +39,7 @@ it_CH{
          }
          gregorian{
              DateTimePatterns{
-                "HH.mm:ss 'h' zzzz",
+                "HH:mm:ss 'h' zzzz",
                  "HH:mm:ss z",
                  "HH:mm:ss",
                  "HH:mm",
diff --git a/icuSources/data/locales/ms.txt b/icuSources/data/locales/ms.txt

index 7d3e5eebbde93992e9cb901d5db170c964e4fcba..2c89fb02304725a8655c6a36df365e0f527879c6 100644 (file)
--- a/icuSources/data/locales/ms.txt
+++ b/icuSources/data/locales/ms.txt
@@ -1291,7 +1291,7 @@ ms{
          }
          length{
              centimeter{
-                other{"{0} sm"}
+                other{"{0} cm"}
              }
              foot{
                  other{"{0}'"}
@@ -1476,7 +1476,7 @@ ms{
          }
          length{
              centimeter{
-                other{"{0} sm"}
+                other{"{0} cm"}
              }
              foot{
                  other{"{0} ka"}
diff --git a/icuSources/data/locales/pl.txt b/icuSources/data/locales/pl.txt

index 0c8753a7f4f86d7aa615e55b314d8ba7e055101c..b7d6f0fbd300e109b207bb590c37fa10615fdffc 100644 (file)
--- a/icuSources/data/locales/pl.txt
+++ b/icuSources/data/locales/pl.txt
@@ -1277,6 +1277,14 @@ pl{
          }
      }
      contextTransforms{
+        day-format-except-narrow:intvector{
+            0,
+            1,
+        }
+        day-standalone-except-narrow:intvector{
+            0,
+            1,
+        }
          month-format-except-narrow:intvector{
              0,
              1,
diff --git a/icuSources/data/locales/sv.txt b/icuSources/data/locales/sv.txt

index 97d74335645c65c61540a3b80d9ce135805edc60..b349d55cd72fd9f7dae71dcc763d912b7108ec94 100644 (file)
--- a/icuSources/data/locales/sv.txt
+++ b/icuSources/data/locales/sv.txt
@@ -1056,6 +1056,20 @@ sv{
              }
              monthNames{
                  format{
+                    abbreviated{
+                        "muharram",
+                        "safar",
+                        "rabi’ al-awwal",
+                        "rabi’ al-akhir",
+                        "jumada-l-ula",
+                        "jumada-l-akhira",
+                        "rajab",
+                        "sha’ban",
+                        "ramadan",
+                        "shawwal",
+                        "dhu-l-ga’da",
+                        "dhu-l-hijja",
+                    }
                      wide{
                          "muharram",
                          "safar",
@@ -1072,6 +1086,20 @@ sv{
                      }
                  }
                  stand-alone{
+                    abbreviated{
+                        "Muharram",
+                        "Safar",
+                        "Rabi’ al-awwal",
+                        "Rabi’ al-akhir",
+                        "Jumada-l-ula",
+                        "Jumada-l-akhira",
+                        "Rajab",
+                        "Sha’ban",
+                        "Ramadan",
+                        "Shawwal",
+                        "Dhu-l-ga’da",
+                        "Dhu-l-hijja",
+                    }
                      wide{
                          "Muharram",
                          "Safar",
diff --git a/icuSources/data/locales/tr.txt b/icuSources/data/locales/tr.txt

index 13b9f0632f3ad33176ebfa1f61dbcd2b1ef00548..1b2c4e5fcbd854659368034017b9ddbac1a29717 100644 (file)
--- a/icuSources/data/locales/tr.txt
+++ b/icuSources/data/locales/tr.txt
@@ -801,6 +801,20 @@ tr{
                      }
                  }
                  stand-alone{
+                    abbreviated{
+                        "Muharrem",
+                        "Safer",
+                        "Rebiülevvel",
+                        "Rebiülahir",
+                        "Cemaziyelevvel",
+                        "Cemaziyelahir",
+                        "Recep",
+                        "Şaban",
+                        "Ramazan",
+                        "Şevval",
+                        "Zilkade",
+                        "Zilhicce",
+                    }
                      narrow{
                          "1",
                          "2",
diff --git a/icuSources/data/locales/zh_Hans_HK.txt b/icuSources/data/locales/zh_Hans_HK.txt

index ba31ca94cd65c08978d6670f59edde6c7b0e0167..82625a66bebec251793f8ba22460aa36fc470bbe 100755 (executable)
--- a/icuSources/data/locales/zh_Hans_HK.txt
+++ b/icuSources/data/locales/zh_Hans_HK.txt
@@ -128,11 +128,11 @@ zh_Hans_HK{
                  "Gy年M月d日",
                  "Gy年M月d日",
                  "d/M/yyGGGGG",
-                "{1}{0}",
-                "{1}{0}",
-                "{1}{0}",
-                "{1}{0}",
-                "{1}{0}",
+                "{1} {0}",
+                "{1} {0}",
+                "{1} {0}",
+                "{1} {0}",
+                "{1} {0}",
              }
              availableFormats{
                  HHmm{"HH:mm"}
@@ -189,8 +189,8 @@ zh_Hans_HK{
                  "y年M月d日",
                  "d/M/yy",
                  "{1} {0}",
-                "{1}{0}",
-                "{1}{0}",
+                "{1} {0}",
+                "{1} {0}",
                  "{1} {0}",
                  "{1} {0}",
              }
diff --git a/icuSources/data/locales/zh_Hans_MO.txt b/icuSources/data/locales/zh_Hans_MO.txt

index 6d5f431a0577340ea1138a574f4de366a6f69617..b4257f63de965471149c45c515492b6153aede56 100755 (executable)
--- a/icuSources/data/locales/zh_Hans_MO.txt
+++ b/icuSources/data/locales/zh_Hans_MO.txt
@@ -91,8 +91,8 @@ zh_Hans_MO{
                  "y年M月d日",
                  "d/M/yy",
                  "{1} {0}",
-                "{1}{0}",
-                "{1}{0}",
+                "{1} {0}",
+                "{1} {0}",
                  "{1} {0}",
                  "{1} {0}",
              }
diff --git a/icuSources/data/locales/zh_Hant.txt b/icuSources/data/locales/zh_Hant.txt

index 3e22ca008299c433b15ce22d1b8abfc907450b17..4a1e971b85a0d1b31f079a2e256a5a159b2ad8da 100644 (file)
--- a/icuSources/data/locales/zh_Hant.txt
+++ b/icuSources/data/locales/zh_Hant.txt
@@ -2273,16 +2273,22 @@ zh_Hant{
              start{"{0}、{1}"}
          }
          unit{
+            2{"{0} {1}"}
+            end{"{0} {1}"}
+            middle{"{0} {1}"}
+            start{"{0} {1}"}
+        }
+        unit-narrow{
              2{"{0}{1}"}
              end{"{0}{1}"}
              middle{"{0}{1}"}
              start{"{0}{1}"}
          }
          unit-short{
-            2{"{0}{1}"}
-            end{"{0}{1}"}
-            middle{"{0}{1}"}
-            start{"{0}{1}"}
+            2{"{0} {1}"}
+            end{"{0} {1}"}
+            middle{"{0} {1}"}
+            start{"{0} {1}"}
          }
      }
      measurementSystemNames{
@@ -2529,28 +2535,28 @@ zh_Hant{
          }
          duration{
              day{
-                other{"{0} 天"}
+                other{"{0}天"}
              }
              hour{
-                other{"{0} 小時"}
+                other{"{0}時"}
              }
              millisecond{
-                other{"{0} 毫秒"}
+                other{"{0}毫秒"}
              }
              minute{
-                other{"{0} 分鐘"}
+                other{"{0}分"}
              }
              month{
-                other{"{0} 個月"}
+                other{"{0}個月"}
              }
              second{
-                other{"{0} 秒"}
+                other{"{0}秒"}
              }
              week{
-                other{"{0} 週"}
+                other{"{0}週"}
              }
              year{
-                other{"{0} 年"}
+                other{"{0}年"}
              }
          }
          energy{
diff --git a/icuSources/data/locales/zh_Hant_HK.txt b/icuSources/data/locales/zh_Hant_HK.txt

index bbca9d7207692747c8f7e25fd951a1ffc6675c3a..10b3b01b773bae2d7329922e9c2bed57346d6fcf 100644 (file)
--- a/icuSources/data/locales/zh_Hant_HK.txt
+++ b/icuSources/data/locales/zh_Hant_HK.txt
@@ -125,11 +125,11 @@ zh_Hant_HK{
                  "Gy年M月d日",
                  "Gy年M月d日",
                  "d/M/yGGGGG",
-                "{1}{0}",
                  "{1} {0}",
                  "{1} {0}",
-                "{1}{0}",
-                "{1}{0}",
+                "{1} {0}",
+                "{1} {0}",
+                "{1} {0}",
              }
              availableFormats{
                  Ed{"d E"}
@@ -199,11 +199,11 @@ zh_Hant_HK{
                  "y年M月d日",
                  "y年M月d日",
                  "d/M/y",
-                "{1}{0}",
                  "{1} {0}",
                  "{1} {0}",
-                "{1}{0}",
-                "{1}{0}",
+                "{1} {0}",
+                "{1} {0}",
+                "{1} {0}",
              }
              availableFormats{
                  Ed{"d E"}
@@ -358,7 +358,7 @@ zh_Hant_HK{
          }
          duration{
              week{
-                other{"{0}星期"}
+                other{"{0} 星期"}
              }
          }
          energy{
diff --git a/icuSources/data/locales/zh_Hant_MO.txt b/icuSources/data/locales/zh_Hant_MO.txt

index 5c22991848333b600e511368d7544858ac9e32b2..a3db6ba8faf3881da5f94c3b36919992321072dd 100644 (file)
--- a/icuSources/data/locales/zh_Hant_MO.txt
+++ b/icuSources/data/locales/zh_Hant_MO.txt
@@ -20,11 +20,11 @@ zh_Hant_MO{
                  "Gy年M月d日",
                  "Gy年M月d日",
                  "d/M/yGGGGG",
-                "{1}{0}",
                  "{1} {0}",
                  "{1} {0}",
-                "{1}{0}",
-                "{1}{0}",
+                "{1} {0}",
+                "{1} {0}",
+                "{1} {0}",
              }
              availableFormats{
                  Ed{"d E"}
@@ -52,11 +52,11 @@ zh_Hant_MO{
                  "y年M月d日",
                  "y年M月d日",
                  "d/M/y",
-                "{1}{0}",
                  "{1} {0}",
                  "{1} {0}",
-                "{1}{0}",
-                "{1}{0}",
+                "{1} {0}",
+                "{1} {0}",
+                "{1} {0}",
              }
              availableFormats{
                  Ed{"d E"}
diff --git a/icuSources/data/misc/metaZones.txt b/icuSources/data/misc/metaZones.txt

index 939d924162d9b02074640cbfb54f3dde1dbfc576..a65629f9919b63d813627148be6a4fd44c83d91e 100644 (file)
--- a/icuSources/data/misc/metaZones.txt
+++ b/icuSources/data/misc/metaZones.txt
@@ -1,6 +1,6 @@
  // ***************************************************************************
  // *
-// * Copyright (C) 2014 International Business Machines
+// * Copyright (C) 2015 International Business Machines
  // * Corporation and others. All Rights Reserved.
  // * Tool: org.unicode.cldr.icu.NewLdml2IcuConverter
  // * Source File: <path>/metaZones.xml
@@ -1412,6 +1412,11 @@ metaZones:table(nofallback){
              {
                  "America_Central",
                  "1998-08-02 06:00",
+                "2015-02-01 08:00",
+            }
+            {
+                "America_Eastern",
+                "2015-02-01 08:00",
                  "9999-12-31 23:59",
              }
          }
@@ -2874,13 +2879,6 @@ metaZones:table(nofallback){
          "Asia:Kamchatka"{
              {
                  "Kamchatka",
-                "1970-01-01 00:00",
-                "2010-03-27 14:00",
-            }
-            {
-                "Magadan",
-                "2010-03-27 14:00",
-                "9999-12-31 23:59",
              }
          }
          "Asia:Karachi"{
@@ -3740,11 +3738,6 @@ metaZones:table(nofallback){
              {
                  "Samara",
                  "1991-10-20 00:00",
-                "2010-03-27 22:00",
-            }
-            {
-                "Moscow",
-                "2010-03-27 22:00",
                  "9999-12-31 23:59",
              }
          }
diff --git a/icuSources/data/misc/windowsZones.txt b/icuSources/data/misc/windowsZones.txt

index 4a9b9f211e4f1a9fd2fc81c09d32f0d00bc6714a..8706eac83d885e6d40de11159b3c312310a6fcb5 100644 (file)
--- a/icuSources/data/misc/windowsZones.txt
+++ b/icuSources/data/misc/windowsZones.txt
@@ -1,6 +1,6 @@
  // ***************************************************************************
  // *
-// * Copyright (C) 2014 International Business Machines
+// * Copyright (C) 2015 International Business Machines
  // * Corporation and others. All Rights Reserved.
  // * Tool: org.unicode.cldr.icu.NewLdml2IcuConverter
  // * Source File: <path>/windowsZones.xml
@@ -145,6 +145,7 @@ windowsZones:table(nofallback){
              AU{"Antarctica/Macquarie"}
              FM{"Pacific/Ponape Pacific/Kosrae"}
              NC{"Pacific/Noumea"}
+            PG{"Pacific/Bougainville"}
              SB{"Pacific/Guadalcanal"}
              VU{"Pacific/Efate"}
              ZZ{"Etc/GMT-11"}
@@ -152,8 +153,8 @@ windowsZones:table(nofallback){
          "Central Standard Time (Mexico)"{
              001{"America/Mexico_City"}
              MX{
-                "America/Mexico_City America/Bahia_Banderas America/Cancun America/Me"
-                "rida America/Monterrey"
+                "America/Mexico_City America/Bahia_Banderas America/Merida America/Mo"
+                "nterrey"
              }
          }
          "Central Standard Time"{
@@ -467,6 +468,7 @@ windowsZones:table(nofallback){
              EC{"America/Guayaquil"}
              JM{"America/Jamaica"}
              KY{"America/Cayman"}
+            MX{"America/Cancun"}
              PA{"America/Panama"}
              PE{"America/Lima"}
              ZZ{"Etc/GMT+5"}
@@ -686,7 +688,7 @@ windowsZones:table(nofallback){
              FM{"Pacific/Truk"}
              GU{"Pacific/Guam"}
              MP{"Pacific/Saipan"}
-            PG{"Pacific/Port_Moresby Pacific/Bougainville"}
+            PG{"Pacific/Port_Moresby"}
              ZZ{"Etc/GMT-10"}
          }
          "Yakutsk Standard Time"{
diff --git a/icuSources/data/misc/zoneinfo64.txt b/icuSources/data/misc/zoneinfo64.txt

index e619ccba5eacd7f7506f2fba761bbbd28ee6f1f9..c131c73838ccc40c62a4b9ce2a28fe31247d9582 100644 (file)
--- a/icuSources/data/misc/zoneinfo64.txt
+++ b/icuSources/data/misc/zoneinfo64.txt
@@ -1,11 +1,11 @@
  //---------------------------------------------------------
-// Copyright (C) 2003-2014, International Business Machines
+// Copyright (C) 2003-2015, International Business Machines
  // Corporation and others.  All Rights Reserved.
  //---------------------------------------------------------
  // Build tool:  tz2icu
-// Build date:  Tue Nov 11 12:33:07 2014
+// Build date:  Tue Feb  3 16:54:37 2015
  // tz database: ftp://ftp.iana.org/tz/
-// tz version:  2014j
+// tz version:  2015a
  // ICU version: 55.0.1
  //---------------------------------------------------------
  // >> !!! >>   THIS IS A MACHINE-GENERATED FILE   << !!! <<
@@ -13,7 +13,7 @@
  //---------------------------------------------------------
  
  zoneinfo64:table(nofallback) {
- TZVersion { "2014j" }
+ TZVersion { "2015a" }
   Zones:array { 
    /* ACT */ :int { 347 } //Z#0
    /* AET */ :int { 359 } //Z#1
@@ -374,12 +374,9 @@ zoneinfo64:table(nofallback) {
      finalYear:int { 2039 }
    } //Z#92
    /* America/Cancun */ :table {
-    trans:intvector { -1514743200, 377935200, 828860400, 846396000, 860310000, 877845600, 891759600, 902037600, 909298800, 923212800, 941353200, 954662400, 972802800, 989136000, 1001833200, 1018166400, 1035702000 }
+    trans:intvector { -1514743200, 377935200, 828860400, 846396000, 860310000, 877845600, 891759600, 902037600, 909298800, 923212800, 941353200, 954662400, 972802800, 989136000, 1001833200, 1018166400, 1035702000, 1049616000, 1067151600, 1081065600, 1099206000, 1112515200, 1130655600, 1143964800, 1162105200, 1175414400, 1193554800, 1207468800, 1225004400, 1238918400, 1256454000, 1270368000, 1288508400, 1301817600, 1319958000, 1333267200, 1351407600, 1365321600, 1382857200, 1396771200, 1414306800, 1422777600 }
      typeOffsets:intvector { -20824, 0, -21600, 0, -21600, 3600, -18000, 0, -18000, 3600 }
-    typeMap:bin { "0103040304030402010201020102010201" }
-    finalRule { "Mexico" }
-    finalRaw:int { -21600 }
-    finalYear:int { 2003 }
+    typeMap:bin { "010304030403040201020102010201020102010201020102010201020102010201020102010201020103" }
    } //Z#93
    /* America/Caracas */ :table {
      transPre32:intvector { -1, 1770461760 }
@@ -1039,12 +1036,9 @@ zoneinfo64:table(nofallback) {
      typeMap:bin { "010201020102010201020102010201020102010201020102010201020103" }
    } //Z#197
    /* America/Santiago */ :table {
-    trans:intvector { -1893439034, -1688410800, -1619983034, -1593806400, -1335986234, -1317585600, -1304362800, -1286049600, -1272826800, -1254513600, -1241290800, -1222977600, -1209754800, -1191355200, -1178132400, -870552000, -865278000, -736376400, -713646000, -36619200, -23922000, -3355200, 7527600, 24465600, 37767600, 55915200, 69217200, 87969600, 100666800, 118209600, 132116400, 150868800, 163566000, 182318400, 195620400, 213768000, 227070000, 245217600, 258519600, 277272000, 289969200, 308721600, 321418800, 340171200, 353473200, 371620800, 384922800, 403070400, 416372400, 434520000, 447822000, 466574400, 479271600, 498024000, 510721200, 529473600, 545194800, 560923200, 574225200, 591768000, 605674800, 624427200, 637729200, 653457600, 668574000, 687326400, 700628400, 718776000, 732078000, 750225600, 763527600, 781675200, 794977200, 813729600, 826426800, 845179200, 859690800, 876628800, 889930800, 906868800, 923194800, 939528000, 952830000, 971582400, 984279600, 1003032000, 1015729200, 1034481600, 1047178800, 1065931200, 1079233200, 1097380800, 1110682800, 1128830400, 1142132400, 1160884800, 1173582000, 1192334400, 1206846000, 1223784000, 1237086000, 1255233600, 1270350000, 1286683200, 1304823600, 1313899200, 1335668400, 1346558400 }
-    typeOffsets:intvector { -16966, 0, -18000, 0, -18000, 3600, -14400, 0, -14400, 3600 }
-    typeMap:bin { "010003000201020102010201020102010201030403040304030403040304030403040304030403040304030403040304030403040304030403040304030403040304030403040304030403040304030403040304030403040304030403040304030403040304030403040304" }
-    finalRule { "Chile" }
-    finalRaw:int { -14400 }
-    finalYear:int { 2013 }
+    trans:intvector { -1893439034, -1688410800, -1619983034, -1593806400, -1335986234, -1317585600, -1304362800, -1286049600, -1272826800, -1254513600, -1241290800, -1222977600, -1209754800, -1191355200, -1178132400, -870552000, -865278000, -736376400, -713646000, -36619200, -23922000, -3355200, 7527600, 24465600, 37767600, 55915200, 69217200, 87969600, 100666800, 118209600, 132116400, 150868800, 163566000, 182318400, 195620400, 213768000, 227070000, 245217600, 258519600, 277272000, 289969200, 308721600, 321418800, 340171200, 353473200, 371620800, 384922800, 403070400, 416372400, 434520000, 447822000, 466574400, 479271600, 498024000, 510721200, 529473600, 545194800, 560923200, 574225200, 591768000, 605674800, 624427200, 637729200, 653457600, 668574000, 687326400, 700628400, 718776000, 732078000, 750225600, 763527600, 781675200, 794977200, 813729600, 826426800, 845179200, 859690800, 876628800, 889930800, 906868800, 923194800, 939528000, 952830000, 971582400, 984279600, 1003032000, 1015729200, 1034481600, 1047178800, 1065931200, 1079233200, 1097380800, 1110682800, 1128830400, 1142132400, 1160884800, 1173582000, 1192334400, 1206846000, 1223784000, 1237086000, 1255233600, 1270350000, 1286683200, 1304823600, 1313899200, 1335668400, 1346558400, 1367118000, 1378612800, 1398567600, 1410062400, 1430017200 }
+    typeOffsets:intvector { -16966, 0, -18000, 0, -18000, 3600, -14400, 0, -14400, 3600, -10800, 0 }
+    typeMap:bin { "0100030002010201020102010201020102010304030403040304030403040304030403040304030403040304030403040304030403040304030403040304030403040304030403040304030403040304030403040304030403040304030403040304030403040304030403040304030405" }
      links:intvector { 198, 385 }
    } //Z#198
    /* America/Santo_Domingo */ :table {
@@ -1218,12 +1212,9 @@ zoneinfo64:table(nofallback) {
    } //Z#227
    /* Antarctica/McMurdo */ :int { 540 } //Z#228
    /* Antarctica/Palmer */ :table {
-    trans:intvector { -157766400, -152658000, -132955200, -121122000, -101419200, -86821200, -71092800, -54766800, -39038400, -23317200, -7588800, 128142000, 136605600, 389070000, 403070400, 416372400, 434520000, 447822000, 466574400, 479271600, 498024000, 510721200, 529473600, 545194800, 560923200, 574225200, 591768000, 605674800, 624427200, 637729200, 653457600, 668574000, 687326400, 700628400, 718776000, 732078000, 750225600, 763527600, 781675200, 794977200, 813729600, 826426800, 845179200, 859690800, 876628800, 889930800, 906868800, 923194800, 939528000, 952830000, 971582400, 984279600, 1003032000, 1015729200, 1034481600, 1047178800, 1065931200, 1079233200, 1097380800, 1110682800, 1128830400, 1142132400, 1160884800, 1173582000, 1192334400, 1206846000, 1223784000, 1237086000, 1255233600, 1270350000, 1286683200, 1304823600, 1313899200, 1335668400, 1346558400 }
+    trans:intvector { -157766400, -152658000, -132955200, -121122000, -101419200, -86821200, -71092800, -54766800, -39038400, -23317200, -7588800, 128142000, 136605600, 389070000, 403070400, 416372400, 434520000, 447822000, 466574400, 479271600, 498024000, 510721200, 529473600, 545194800, 560923200, 574225200, 591768000, 605674800, 624427200, 637729200, 653457600, 668574000, 687326400, 700628400, 718776000, 732078000, 750225600, 763527600, 781675200, 794977200, 813729600, 826426800, 845179200, 859690800, 876628800, 889930800, 906868800, 923194800, 939528000, 952830000, 971582400, 984279600, 1003032000, 1015729200, 1034481600, 1047178800, 1065931200, 1079233200, 1097380800, 1110682800, 1128830400, 1142132400, 1160884800, 1173582000, 1192334400, 1206846000, 1223784000, 1237086000, 1255233600, 1270350000, 1286683200, 1304823600, 1313899200, 1335668400, 1346558400, 1367118000, 1378612800, 1398567600, 1410062400, 1430017200 }
      typeOffsets:intvector { 0, 0, -14400, 0, -14400, 3600, -10800, 0, -10800, 3600 }
-    typeMap:bin { "020102010201020102010304030102010201020102010201020102010201020102010201020102010201020102010201020102010201020102010201020102010201020102010201020102" }
-    finalRule { "ChileAQ" }
-    finalRaw:int { -14400 }
-    finalYear:int { 2013 }
+    typeMap:bin { "0201020102010201020103040301020102010201020102010201020102010201020102010201020102010201020102010201020102010201020102010201020102010201020102010201020102010203" }
    } //Z#229
    /* Antarctica/Rothera */ :table {
      trans:intvector { 218246400 }
@@ -1250,11 +1241,7 @@ zoneinfo64:table(nofallback) {
      typeMap:bin { "01" }
    } //Z#234
    /* Arctic/Longyearbyen */ :int { 464 } //Z#235
-  /* Asia/Aden */ :table {
-    trans:intvector { -631162794 }
-    typeOffsets:intvector { 10794, 0, 10800, 0 }
-    typeMap:bin { "01" }
-  } //Z#236
+  /* Asia/Aden */ :int { 303 } //Z#236
    /* Asia/Almaty */ :table {
      trans:intvector { -1441170468, -1247547600, 354909600, 370717200, 386445600, 402253200, 417981600, 433789200, 449604000, 465336000, 481060800, 496785600, 512510400, 528235200, 543960000, 559684800, 575409600, 591134400, 606859200, 622584000, 638308800, 654638400, 701802000, 717523200, 733262400, 748987200, 764712000, 780436800, 796161600, 811886400, 828216000, 846360000, 859665600, 877809600, 891115200, 909259200, 922564800, 941313600, 954014400, 972763200, 985464000, 1004212800, 1017518400, 1035662400, 1048968000, 1067112000, 1080417600, 1099166400 }
      typeOffsets:intvector { 18468, 0, 18000, 0, 21600, 0, 21600, 3600 }
@@ -1296,11 +1283,7 @@ zoneinfo64:table(nofallback) {
      typeOffsets:intvector { 10660, 0, 10656, 0, 10800, 0, 10800, 3600 }
      typeMap:bin { "010203020302030203020302030203020302030203020302030203020302030203020302030203020302030203020302030203020302" }
    } //Z#244
-  /* Asia/Bahrain */ :table {
-    trans:intvector { -1577935340, 76190400 }
-    typeOffsets:intvector { 12140, 0, 10800, 0, 14400, 0 }
-    typeMap:bin { "0201" }
-  } //Z#245
+  /* Asia/Bahrain */ :int { 300 } //Z#245
    /* Asia/Baku */ :table {
      trans:intvector { -1441163964, -405140400, 354916800, 370724400, 386452800, 402260400, 417988800, 433796400, 449611200, 465343200, 481068000, 496792800, 512517600, 528242400, 543967200, 559692000, 575416800, 591141600, 606866400, 622591200, 638316000, 654645600, 670370400, 686098800, 701812800, 717534000, 828234000, 846378000, 859680000, 877824000 }
      typeOffsets:intvector { 11964, 0, 10800, 0, 10800, 3600, 14400, 0, 14400, 3600 }
@@ -1378,6 +1361,7 @@ zoneinfo64:table(nofallback) {
      trans:intvector { -1577936472 }
      typeOffsets:intvector { 13272, 0, 14400, 0 }
      typeMap:bin { "01" }
+    links:intvector { 261, 291 }
    } //Z#261
    /* Asia/Dushanbe */ :table {
      trans:intvector { -1441168512, -1247547600, 354909600, 370717200, 386445600, 402253200, 417981600, 433789200, 449604000, 465336000, 481060800, 496785600, 512510400, 528235200, 543960000, 559684800, 575409600, 591134400, 606859200, 622584000, 638308800, 654638400, 670363200, 684363600 }
@@ -1499,11 +1483,7 @@ zoneinfo64:table(nofallback) {
      typeOffsets:intvector { 26480, 0, 27000, 0, 28800, 0, 28800, 1200, 32400, 0 }
      typeMap:bin { "010203020302030203020302030203020402" }
    } //Z#284
-  /* Asia/Kuwait */ :table {
-    trans:intvector { -631163516 }
-    typeOffsets:intvector { 11516, 0, 10800, 0 }
-    typeMap:bin { "01" }
-  } //Z#285
+  /* Asia/Kuwait */ :int { 303 } //Z#285
    /* Asia/Macao */ :int { 287 } //Z#286
    /* Asia/Macau */ :table {
      trans:intvector { -1830411260, -277360200, -257405400, -245910600, -225955800, -214473600, -194506200, -182406600, -163056600, -150969600, -131619600, -117088200, -101367000, -85638600, -69312600, -53584200, -37863000, -22134600, -6413400, 9315000, 25036200, 40764600, 56485800, 72201600, 87922800, 103651200, 119977200, 135705600, 151439400, 167167800, 182889000, 198617400, 214338600, 230067000, 245788200, 261504000, 277225200, 292953600, 309279600, 325008000, 340729200 }
@@ -1528,11 +1508,7 @@ zoneinfo64:table(nofallback) {
      typeOffsets:intvector { -57360, 0, 28800, 0, 28800, 3600, 29040, 0, 32400, 0 }
      typeMap:bin { "03010201040102010201" }
    } //Z#290
-  /* Asia/Muscat */ :table {
-    trans:intvector { -1577937264 }
-    typeOffsets:intvector { 14064, 0, 14400, 0 }
-    typeMap:bin { "01" }
-  } //Z#291
+  /* Asia/Muscat */ :int { 261 } //Z#291
    /* Asia/Nicosia */ :table {
      trans:intvector { -1518920008, 166572000, 182293200, 200959200, 213829200, 228866400, 243982800, 260316000, 276123600, 291765600, 307486800, 323820000, 338936400, 354664800, 370386000, 386114400, 401835600, 417564000, 433285200, 449013600, 465339600, 481068000, 496789200, 512517600, 528238800, 543967200, 559688400, 575416800, 591138000, 606866400, 622587600, 638316000, 654642000, 670370400, 686091600, 701820000, 717541200, 733269600, 748990800, 764719200, 780440400, 796168800, 811890000, 828223200, 843944400, 859672800, 875394000, 891122400, 909277200, 922582800, 941331600 }
      typeOffsets:intvector { 8008, 0, 7200, 0, 7200, 3600 }
@@ -1577,6 +1553,7 @@ zoneinfo64:table(nofallback) {
      trans:intvector { -1577935568, 76190400 }
      typeOffsets:intvector { 12368, 0, 10800, 0, 14400, 0 }
      typeMap:bin { "0201" }
+    links:intvector { 245, 300 }
    } //Z#300
    /* Asia/Qyzylorda */ :table {
      trans:intvector { -1441167712, -1247544000, 354913200, 370720800, 386445600, 402256800, 417985200, 433792800, 449607600, 465339600, 481064400, 496789200, 512514000, 528238800, 543963600, 559688400, 575413200, 591138000, 606862800, 622587600, 638312400, 654642000, 695768400, 701802000, 717523200, 733262400, 748987200, 764712000, 780436800, 796161600, 811886400, 828216000, 846360000, 859665600, 877809600, 891115200, 909259200, 922564800, 941313600, 954014400, 972763200, 985464000, 1004212800, 1017518400, 1035662400, 1048968000, 1067112000, 1080417600, 1099166400 }
@@ -1592,6 +1569,7 @@ zoneinfo64:table(nofallback) {
      trans:intvector { -719636812 }
      typeOffsets:intvector { 11212, 0, 10800, 0 }
      typeMap:bin { "01" }
+    links:intvector { 236, 285, 303 }
    } //Z#303
    /* Asia/Saigon */ :int { 266 } //Z#304
    /* Asia/Sakhalin */ :table {
@@ -1757,10 +1735,9 @@ zoneinfo64:table(nofallback) {
      finalYear:int { 1997 }
    } //Z#336
    /* Atlantic/Reykjavik */ :table {
-    transPre32:intvector { -1, 97919740 }
-    trans:intvector { -1956609132, -1668211200, -1647212400, -1636675200, -1613430000, -968025600, -949615200, -942008400, -920239200, -909957600, -888789600, -877903200, -857944800, -846453600, -826495200, -815004000, -795045600, -783554400, -762991200, -752104800, -731541600, -717631200, -700092000, -686181600, -668642400, -654732000, -636588000, -623282400, -605743200, -591832800, -573688800, -559778400, -542239200, -528328800, -510789600, -496879200, -479340000, -465429600, -447890400, -433980000, -415836000, -401925600, -384386400, -370476000, -352936800, -339026400, -321487200, -307576800, -290037600, -276127200, -258588000, -244677600, -226533600, -212623200, -195084000, -181173600, -163634400, -149724000, -132184800, -118274400, -100735200, -86824800, -68680800, -54770400 }
-    typeOffsets:intvector { -5244, 0, -5268, 0, -3600, 0, -3600, 3600, 0, 0 }
-    typeMap:bin { "0102030203020302030203020302030203020302030203020302030203020302030203020302030203020302030203020302030203020302030203020302030204" }
+    trans:intvector { -1956609120, -1668211200, -1647212400, -1636675200, -1613430000, -1605139200, -1581894000, -1539561600, -1531350000, -968025600, -952293600, -942008400, -920239200, -909957600, -888789600, -877903200, -857944800, -846453600, -826495200, -815004000, -795045600, -783554400, -762991200, -752104800, -731541600, -717631200, -700092000, -686181600, -668642400, -654732000, -636588000, -623282400, -605743200, -591832800, -573688800, -559778400, -542239200, -528328800, -510789600, -496879200, -479340000, -465429600, -447890400, -433980000, -415836000, -401925600, -384386400, -370476000, -352936800, -339026400, -321487200, -307576800, -290037600, -276127200, -258588000, -244677600, -226533600, -212623200, -195084000, -181173600, -163634400, -149724000, -132184800, -118274400, -100735200, -86824800, -68680800, -54770400 }
+    typeOffsets:intvector { -5280, 0, -3600, 0, -3600, 3600, 0, 0 }
+    typeMap:bin { "0102010201020102010201020102010201020102010201020102010201020102010201020102010201020102010201020102010201020102010201020102010201020103" }
      links:intvector { 337, 502 }
    } //Z#337
    /* Atlantic/South_Georgia */ :table {
@@ -2575,12 +2552,9 @@ zoneinfo64:table(nofallback) {
    } //Z#543
    /* Pacific/Easter */ :table {
      transPre32:intvector { -1, 1770471960 }
-    trans:intvector { -1178124152, -870552000, -865278000, -736376400, -36619200, -23922000, -3355200, 7527600, 24465600, 37767600, 55915200, 69217200, 87969600, 100666800, 118209600, 132116400, 150868800, 163566000, 182318400, 195620400, 213768000, 227070000, 245217600, 258519600, 277272000, 289969200, 308721600, 321418800, 340171200, 353473200, 371620800, 384922800, 403070400, 416372400, 434520000, 447822000, 466574400, 479271600, 498024000, 510721200, 529473600, 545194800, 560923200, 574225200, 591768000, 605674800, 624427200, 637729200, 653457600, 668574000, 687326400, 700628400, 718776000, 732078000, 750225600, 763527600, 781675200, 794977200, 813729600, 826426800, 845179200, 859690800, 876628800, 889930800, 906868800, 923194800, 939528000, 952830000, 971582400, 984279600, 1003032000, 1015729200, 1034481600, 1047178800, 1065931200, 1079233200, 1097380800, 1110682800, 1128830400, 1142132400, 1160884800, 1173582000, 1192334400, 1206846000, 1223784000, 1237086000, 1255233600, 1270350000, 1286683200, 1304823600, 1313899200, 1335668400, 1346558400 }
-    typeOffsets:intvector { -26264, 0, -26248, 0, -25200, 0, -25200, 3600, -21600, 0, -21600, 3600 }
-    typeMap:bin { "01030203020302030203020302030203020302030203020302030203020302030405040504050405040504050405040504050405040504050405040504050405040504050405040504050405040504050405040504050405040504050405" }
-    finalRule { "Chile" }
-    finalRaw:int { -21600 }
-    finalYear:int { 2013 }
+    trans:intvector { -1178124152, -870552000, -865278000, -736376400, -36619200, -23922000, -3355200, 7527600, 24465600, 37767600, 55915200, 69217200, 87969600, 100666800, 118209600, 132116400, 150868800, 163566000, 182318400, 195620400, 213768000, 227070000, 245217600, 258519600, 277272000, 289969200, 308721600, 321418800, 340171200, 353473200, 371620800, 384836400, 384922800, 403070400, 416372400, 434520000, 447822000, 466574400, 479271600, 498024000, 510721200, 529473600, 545194800, 560923200, 574225200, 591768000, 605674800, 624427200, 637729200, 653457600, 668574000, 687326400, 700628400, 718776000, 732078000, 750225600, 763527600, 781675200, 794977200, 813729600, 826426800, 845179200, 859690800, 876628800, 889930800, 906868800, 923194800, 939528000, 952830000, 971582400, 984279600, 1003032000, 1015729200, 1034481600, 1047178800, 1065931200, 1079233200, 1097380800, 1110682800, 1128830400, 1142132400, 1160884800, 1173582000, 1192334400, 1206846000, 1223784000, 1237086000, 1255233600, 1270350000, 1286683200, 1304823600, 1313899200, 1335668400, 1346558400, 1367118000, 1378612800, 1398567600, 1410062400, 1430017200 }
+    typeOffsets:intvector { -26264, 0, -26248, 0, -25200, 0, -25200, 3600, -21600, 0, -21600, 3600, -18000, 0 }
+    typeMap:bin { "01030203020302030203020302030203020302030203020302030203020302030504050405040504050405040504050405040504050405040504050405040504050405040504050405040504050405040504050405040504050405040504050405040506" }
      links:intvector { 386, 544 }
    } //Z#544
    /* Pacific/Efate */ :table {
@@ -3092,81 +3066,75 @@ zoneinfo64:table(nofallback) {
    Chatham:intvector {
      8, -30, -1, 9900, 1, 3, 1, -1, 9900, 1, 3600
    } //_#8
-  Chile:intvector {
-    8, 2, -1, 14400, 2, 3, 23, -1, 10800, 2, 3600
-  } //_#9
-  ChileAQ:intvector {
-    8, 2, -1, 14400, 2, 3, 23, -1, 10800, 2, 3600
-  } //_#10
    Cuba:intvector {
      2, 8, -1, 0, 1, 10, 1, -1, 0, 1, 3600
-  } //_#11
+  } //_#9
    EU:intvector {
      2, -31, -1, 3600, 2, 9, -31, -1, 3600, 2, 3600
-  } //_#12
+  } //_#10
    EUAsia:intvector {
      2, -31, -1, 3600, 2, 9, -31, -1, 3600, 2, 3600
-  } //_#13
+  } //_#11
    Egypt:intvector {
      3, -30, -6, 0, 1, 8, -30, -5, 86400, 0, 3600
-  } //_#14
+  } //_#12
    Fiji:intvector {
      10, 1, -1, 7200, 0, 0, 18, -1, 10800, 0, 3600
-  } //_#15
+  } //_#13
    Haiti:intvector {
      2, 8, -1, 7200, 0, 10, 1, -1, 7200, 0, 3600
-  } //_#16
+  } //_#14
    Jordan:intvector {
      2, -31, -5, 86400, 0, 9, -31, -6, 0, 1, 3600
-  } //_#17
+  } //_#15
    LH:intvector {
      9, 1, -1, 7200, 0, 3, 1, -1, 7200, 0, 1800
-  } //_#18
+  } //_#16
    Lebanon:intvector {
      2, -31, -1, 0, 0, 9, -31, -1, 0, 0, 3600
-  } //_#19
+  } //_#17
    Mexico:intvector {
      3, 1, -1, 7200, 0, 9, -31, -1, 7200, 0, 3600
-  } //_#20
+  } //_#18
    Morocco:intvector {
      2, -31, -1, 7200, 0, 9, -31, -1, 10800, 0, 3600
-  } //_#21
+  } //_#19
    NZ:intvector {
      8, -30, -1, 7200, 1, 3, 1, -1, 7200, 1, 3600
-  } //_#22
+  } //_#20
    Namibia:intvector {
      8, 1, -1, 7200, 0, 3, 1, -1, 7200, 0, 3600
-  } //_#23
+  } //_#21
    Palestine:intvector {
      2, -31, -5, 86400, 0, 8, 21, -6, 0, 0, 3600
-  } //_#24
+  } //_#22
    Para:intvector {
      9, 1, -1, 0, 0, 2, 22, -1, 0, 0, 3600
-  } //_#25
+  } //_#23
    Syria:intvector {
      2, -31, -6, 0, 0, 9, -31, -6, 0, 0, 3600
-  } //_#26
+  } //_#24
    SystemV:intvector {
      3, -30, -1, 7200, 0, 9, -31, -1, 7200, 0, 3600
-  } //_#27
+  } //_#25
    Thule:intvector {
      2, 8, -1, 7200, 0, 10, 1, -1, 7200, 0, 3600
-  } //_#28
+  } //_#26
    Troll:intvector {
      2, -31, -1, 3600, 2, 9, -31, -1, 3600, 2, 7200
-  } //_#29
+  } //_#27
    US:intvector {
      2, 8, -1, 7200, 0, 10, 1, -1, 7200, 0, 3600
-  } //_#30
+  } //_#28
    Uruguay:intvector {
      9, 1, -1, 7200, 0, 2, 8, -1, 7200, 0, 3600
-  } //_#31
+  } //_#29
    WS:intvector {
      8, -30, -1, 10800, 0, 3, 1, -1, 14400, 0, 3600
-  } //_#32
+  } //_#30
    Zion:intvector {
      2, 23, -6, 7200, 0, 9, -31, -1, 7200, 0, 3600
-  } //_#33
+  } //_#31
   }
   Regions:array {
    "AU", //Z#0 ACT
diff --git a/icuSources/data/unidata/ApplePUApatch.txt b/icuSources/data/unidata/ApplePUApatch.txt

index 473b1555ab044e4ffe47319d189221a6b81cec0b..0b8b765244e685909de1b4172a53c2cad67127d8 100644 (file)
--- a/icuSources/data/unidata/ApplePUApatch.txt
+++ b/icuSources/data/unidata/ApplePUApatch.txt
@@ -1,5 +1,5 @@
---- base_unidata/DerivedCoreProperties.txt     2014-03-09 16:20:00.000000000 -0700
-+++ DerivedCoreProperties.txt  2014-03-10 06:35:56.000000000 -0700
+--- base_unidata/DerivedCoreProperties.txt     2014-03-16 23:30:07.000000000 -0700
++++ DerivedCoreProperties.txt  2014-10-31 21:56:40.000000000 -0700
  @@ -162,6 +162,9 @@
   29FE..2AFF    ; Math
   2B30..2B44    ; Math
@@ -90,6 +90,15 @@
   F900..FA6D    ; Grapheme_Base
   FA70..FAD9    ; Grapheme_Base
   FB00..FB06    ; Grapheme_Base
+@@ -9979,7 +9998,7 @@
+ 1F3A0..1F3C4  ; Grapheme_Base
+ 1F3C6..1F3CA  ; Grapheme_Base
+ 1F3E0..1F3F0  ; Grapheme_Base
+-1F400..1F43E  ; Grapheme_Base
++1F3FB..1F43E  ; Grapheme_Base
+ 1F440         ; Grapheme_Base
+ 1F442..1F4F7  ; Grapheme_Base
+ 1F4F9..1F4FC  ; Grapheme_Base
  @@ -9995,7 +10014,7 @@
   2B740..2B81D  ; Grapheme_Base
   2F800..2FA1D  ; Grapheme_Base
@@ -99,8 +108,8 @@
   
   # ================================================
   
---- base_unidata/ppucd.txt     2014-03-09 16:20:00.000000000 -0700
-+++ ppucd.txt  2014-03-10 06:34:27.000000000 -0700
+--- base_unidata/ppucd.txt     2014-03-16 23:30:07.000000000 -0700
++++ ppucd.txt  2014-10-31 22:12:22.000000000 -0700
  @@ -17833,8 +17833,47 @@
   block;DC00..DFFF;age=2.0;blk=Low_Surrogates;gc=Cs;GCB=CN;lb=SG
   # DC00..DFFF Low Surrogates
@@ -151,8 +160,23 @@
   
   block;F900..FAFF;age=1.1;Alpha;blk=CJK_Compat_Ideographs;Comp_Ex;CWKCF;dt=Can;ea=W;gc=Lo;Gr_Base;IDC;Ideo;IDS;lb=ID;NFC_QC=N;NFD_QC=N;NFKC_QC=N;NFKD_QC=N;SB=LE;sc=Hani;XIDC;XIDS
   # F900..FAFF CJK Compatibility Ideographs
---- base_unidata/UnicodeData.txt       2014-03-09 16:20:00.000000000 -0700
-+++ UnicodeData.txt    2014-03-09 16:20:01.000000000 -0700
+@@ -27053,7 +27092,13 @@
+ cp;1F3EE;na=IZAKAYA LANTERN
+ cp;1F3EF;na=JAPANESE CASTLE
+ cp;1F3F0;na=EUROPEAN CASTLE
+-cp;1F3F1..1F3FF;age=NA;bc=L;gc=Cn;-Gr_Base;lb=XX;sc=Zzzz
++cp;1F3F1..1F3FA;age=NA;bc=L;gc=Cn;-Gr_Base;lb=XX;sc=Zzzz
++# Early add of emoji modifiers for Fizpatrick types, slated for Unicode 8.0
++cp;1F3FB;age=8.0;gc=Sk;na=EMOJI MODIFIER FITZPATRICK TYPE-1-2
++cp;1F3FC;age=8.0;gc=Sk;na=EMOJI MODIFIER FITZPATRICK TYPE-3
++cp;1F3FD;age=8.0;gc=Sk;na=EMOJI MODIFIER FITZPATRICK TYPE-4
++cp;1F3FE;age=8.0;gc=Sk;na=EMOJI MODIFIER FITZPATRICK TYPE-5
++cp;1F3FF;age=8.0;gc=Sk;na=EMOJI MODIFIER FITZPATRICK TYPE-6
+ # Animal symbols
+ cp;1F400;na=RAT
+ cp;1F401;na=MOUSE
+--- base_unidata/UnicodeData.txt       2014-03-16 23:30:07.000000000 -0700
++++ UnicodeData.txt    2014-10-31 21:51:42.000000000 -0700
  @@ -14443,7 +14443,65 @@
   DC00;<Low Surrogate, First>;Cs;0;L;;;;;N;;;;;
   DFFF;<Low Surrogate, Last>;Cs;0;L;;;;;N;;;;;
@@ -220,8 +244,20 @@
   F900;CJK COMPATIBILITY IDEOGRAPH-F900;Lo;0;L;8C48;;;;N;;;;;
   F901;CJK COMPATIBILITY IDEOGRAPH-F901;Lo;0;L;66F4;;;;N;;;;;
   F902;CJK COMPATIBILITY IDEOGRAPH-F902;Lo;0;L;8ECA;;;;N;;;;;
---- norm2/base_norm2/nfc.txt   2014-03-08 23:07:17.000000000 -0800
-+++ norm2/nfc.txt      2014-03-09 16:14:50.000000000 -0700
+@@ -22937,6 +22995,11 @@
+ 1F3EE;IZAKAYA LANTERN;So;0;ON;;;;;N;;;;;
+ 1F3EF;JAPANESE CASTLE;So;0;ON;;;;;N;;;;;
+ 1F3F0;EUROPEAN CASTLE;So;0;ON;;;;;N;;;;;
++1F3FB;EMOJI MODIFIER FITZPATRICK TYPE-1-2;Sk;0;ON;;;;;N;;;;;
++1F3FC;EMOJI MODIFIER FITZPATRICK TYPE-3;Sk;0;ON;;;;;N;;;;;
++1F3FD;EMOJI MODIFIER FITZPATRICK TYPE-4;Sk;0;ON;;;;;N;;;;;
++1F3FE;EMOJI MODIFIER FITZPATRICK TYPE-5;Sk;0;ON;;;;;N;;;;;
++1F3FF;EMOJI MODIFIER FITZPATRICK TYPE-6;Sk;0;ON;;;;;N;;;;;
+ 1F400;RAT;So;0;ON;;;;;N;;;;;
+ 1F401;MOUSE;So;0;ON;;;;;N;;;;;
+ 1F402;OX;So;0;ON;;;;;N;;;;;
+--- norm2/base_norm2/nfc.txt   2014-03-16 23:30:07.000000000 -0700
++++ norm2/nfc.txt      2014-03-16 23:30:07.000000000 -0700
  @@ -272,6 +272,8 @@
   AAC1:230
   AAF6:9
diff --git a/icuSources/data/unidata/DerivedCoreProperties.txt b/icuSources/data/unidata/DerivedCoreProperties.txt

index 35ec6a10cb87053ad6c5c829ec91b84a60ea5abd..9465d4cbd1c62e9fdbf0f7b2f283bcb4900e3e3f 100644 (file)
--- a/icuSources/data/unidata/DerivedCoreProperties.txt
+++ b/icuSources/data/unidata/DerivedCoreProperties.txt
@@ -9998,7 +9998,7 @@ FFFC..FFFD    ; Grapheme_Base
  1F3A0..1F3C4  ; Grapheme_Base
  1F3C6..1F3CA  ; Grapheme_Base
  1F3E0..1F3F0  ; Grapheme_Base
-1F400..1F43E  ; Grapheme_Base
+1F3FB..1F43E  ; Grapheme_Base
  1F440         ; Grapheme_Base
  1F442..1F4F7  ; Grapheme_Base
  1F4F9..1F4FC  ; Grapheme_Base
diff --git a/icuSources/data/unidata/UnicodeData.txt b/icuSources/data/unidata/UnicodeData.txt

index 5bdbce52cdd90e6a29e7a4927f15b7317b446a09..6d199f5bec6eda112879eca81aab935e399da656 100644 (file)
--- a/icuSources/data/unidata/UnicodeData.txt
+++ b/icuSources/data/unidata/UnicodeData.txt
@@ -22995,6 +22995,11 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;;
  1F3EE;IZAKAYA LANTERN;So;0;ON;;;;;N;;;;;
  1F3EF;JAPANESE CASTLE;So;0;ON;;;;;N;;;;;
  1F3F0;EUROPEAN CASTLE;So;0;ON;;;;;N;;;;;
+1F3FB;EMOJI MODIFIER FITZPATRICK TYPE-1-2;Sk;0;ON;;;;;N;;;;;
+1F3FC;EMOJI MODIFIER FITZPATRICK TYPE-3;Sk;0;ON;;;;;N;;;;;
+1F3FD;EMOJI MODIFIER FITZPATRICK TYPE-4;Sk;0;ON;;;;;N;;;;;
+1F3FE;EMOJI MODIFIER FITZPATRICK TYPE-5;Sk;0;ON;;;;;N;;;;;
+1F3FF;EMOJI MODIFIER FITZPATRICK TYPE-6;Sk;0;ON;;;;;N;;;;;
  1F400;RAT;So;0;ON;;;;;N;;;;;
  1F401;MOUSE;So;0;ON;;;;;N;;;;;
  1F402;OX;So;0;ON;;;;;N;;;;;
diff --git a/icuSources/data/unidata/ppucd.txt b/icuSources/data/unidata/ppucd.txt

index 21e00cf6e83bec5ad6a16126306fb5074ff8bc61..a8713d141e33df2668a9e5d8933ffbdf4e7a1347 100644 (file)
--- a/icuSources/data/unidata/ppucd.txt
+++ b/icuSources/data/unidata/ppucd.txt
@@ -27092,7 +27092,13 @@ cp;1F3ED;na=FACTORY
  cp;1F3EE;na=IZAKAYA LANTERN
  cp;1F3EF;na=JAPANESE CASTLE
  cp;1F3F0;na=EUROPEAN CASTLE
-cp;1F3F1..1F3FF;age=NA;bc=L;gc=Cn;-Gr_Base;lb=XX;sc=Zzzz
+cp;1F3F1..1F3FA;age=NA;bc=L;gc=Cn;-Gr_Base;lb=XX;sc=Zzzz
+# Early add of emoji modifiers for Fitzpatrick types, slated for Unicode 8.0
+cp;1F3FB;age=8.0;gc=Sk;na=EMOJI MODIFIER FITZPATRICK TYPE-1-2
+cp;1F3FC;age=8.0;gc=Sk;na=EMOJI MODIFIER FITZPATRICK TYPE-3
+cp;1F3FD;age=8.0;gc=Sk;na=EMOJI MODIFIER FITZPATRICK TYPE-4
+cp;1F3FE;age=8.0;gc=Sk;na=EMOJI MODIFIER FITZPATRICK TYPE-5
+cp;1F3FF;age=8.0;gc=Sk;na=EMOJI MODIFIER FITZPATRICK TYPE-6
  # Animal symbols
  cp;1F400;na=RAT
  cp;1F401;na=MOUSE
diff --git a/icuSources/i18n/decimfmt.cpp b/icuSources/i18n/decimfmt.cpp

index e9280bc6c27098f1024c1fe1e84f021f5e097a45..207abaf7613fa2bbcddf214e698fef5a29fe671c 100644 (file)
--- a/icuSources/i18n/decimfmt.cpp
+++ b/icuSources/i18n/decimfmt.cpp
@@ -4106,7 +4106,7 @@ void DecimalFormat::setExponentSignAlwaysShown(UBool expSignAlways) {
  int32_t
  DecimalFormat::getGroupingSize() const
  {
-    return fGroupingSize;
+    return isGroupingUsed() ? fGroupingSize : 0;
  }
  
  //------------------------------------------------------------------------------
diff --git a/icuSources/i18n/smpdtfmt.cpp b/icuSources/i18n/smpdtfmt.cpp

index 9198b0540eab15bbd1926413fc635ebdbdda63f2..1e9d791bdd3446de8a2ce1d27c2aa73089c0485f 100644 (file)
--- a/icuSources/i18n/smpdtfmt.cpp
+++ b/icuSources/i18n/smpdtfmt.cpp
@@ -2260,6 +2260,8 @@ int32_t SimpleDateFormat::matchQuarterString(const UnicodeString& text,
  }
  
  //----------------------------------------------------------------------
+#define IS_BIDI_MARK(c) (c==0x200E || c==0x200F || c==0x061C) 
+
  UBool SimpleDateFormat::matchLiterals(const UnicodeString &pattern,
                                        int32_t &patternOffset,
                                        const UnicodeString &text,
@@ -2289,21 +2291,26 @@ UBool SimpleDateFormat::matchLiterals(const UnicodeString &pattern,
              }
          }
          
-        literal += ch;
+        if (!IS_BIDI_MARK(ch)) {
+            literal += ch;
+        }
      }
      
-    // at this point, literal contains the literal text
+    // at this point, literal contains the pattern literal text (without bidi marks)
      // and i is the index of the next non-literal pattern character.
      int32_t p;
      int32_t t = textOffset;
      
      if (whitespaceLenient) {
-        // trim leading, trailing whitespace from
-        // the literal text
+        // trim leading, trailing whitespace from the pattern literal
          literal.trim();
          
-        // ignore any leading whitespace in the text
-        while (t < text.length() && u_isWhitespace(text.charAt(t))) {
+        // ignore any leading whitespace (or bidi marks) in the text
+        while (t < text.length()) {
+            UChar ch = text.charAt(t);
+            if (!u_isWhitespace(ch) && !IS_BIDI_MARK(ch)) {
+                break;
+            }
              t += 1;
          }
      }
@@ -2311,18 +2318,26 @@ UBool SimpleDateFormat::matchLiterals(const UnicodeString &pattern,
      for (p = 0; p < literal.length() && t < text.length();) {
          UBool needWhitespace = FALSE;
          
+        // Skip any whitespace at current position in pattern,
+        // but remember whether we found whitespace in the pattern
+        // (we already deleted any bidi marks in the pattern).
          while (p < literal.length() && PatternProps::isWhiteSpace(literal.charAt(p))) {
              needWhitespace = TRUE;
              p += 1;
          }
          
+        // If the pattern has whitespace at this point, skip it in text as well
+        // (if the text does not have any, that may be an error for strict parsing)
          if (needWhitespace) {
-            int32_t tStart = t;
+            UBool whitespaceInText = FALSE;
              
+            // Skip any whitespace (or bidi marks) at current position in text,
+            // but remember whether we found whitespace in the text at this point.
              while (t < text.length()) {
                  UChar tch = text.charAt(t);
-                
-                if (!u_isUWhiteSpace(tch) && !PatternProps::isWhiteSpace(tch)) {
+                if (u_isUWhiteSpace(tch) || PatternProps::isWhiteSpace(tch)) {
+                    whitespaceInText = TRUE;
+                } else if (!IS_BIDI_MARK(tch)) {
                      break;
                  }
                  
@@ -2332,7 +2347,7 @@ UBool SimpleDateFormat::matchLiterals(const UnicodeString &pattern,
              // TODO: should we require internal spaces
              // in lenient mode? (There won't be any
              // leading or trailing spaces)
-            if (!whitespaceLenient && t == tStart) {
+            if (!whitespaceLenient && !whitespaceInText) {
                  // didn't find matching whitespace:
                  // an error in strict mode
                  return FALSE;
@@ -2343,6 +2358,11 @@ UBool SimpleDateFormat::matchLiterals(const UnicodeString &pattern,
              if (p >= literal.length()) {
                  break;
              }
+        } else {
+            // Still need to skip any bidi marks in the text
+            while (t < text.length() && IS_BIDI_MARK(text.charAt(t))) {
+                ++t;
+            }
          }
          
          if (t >= text.length() || literal.charAt(p) != text.charAt(t)) {
@@ -2387,7 +2407,7 @@ UBool SimpleDateFormat::matchLiterals(const UnicodeString &pattern,
          for (t = textOffset; t < text.length(); t += 1) {
              UChar ch = text.charAt(t);
              
-            if (ignorables == NULL || !ignorables->contains(ch)) {
+            if (!IS_BIDI_MARK(ch) && (ignorables == NULL || !ignorables->contains(ch))) {
                  break;
              }
          }
diff --git a/icuSources/test/cintltst/cloctst.c b/icuSources/test/cintltst/cloctst.c

index feeaf7664dc25f549463d663bd468e0afa337ef4..a97f4adab1411be6dc4da6b2541e1bc4b3a3ca6c 100644 (file)
--- a/icuSources/test/cintltst/cloctst.c
+++ b/icuSources/test/cintltst/cloctst.c
@@ -46,6 +46,8 @@ static void TestDisplayNameBrackets(void);
  
  static void TestUnicodeDefines(void);
  
+static void TestGetAppleParent(void);
+
  void PrintDataTable();
  
  /*---------------------------------------------------
@@ -248,6 +250,7 @@ void addLocaleTest(TestNode** root)
      TESTCASE(TestUnicodeDefines);
      TESTCASE(TestEnglishExemplarCharacters);
      TESTCASE(TestDisplayNameBrackets);
+    TESTCASE(TestGetAppleParent);
  }
  
  
@@ -5877,3 +5880,77 @@ static void TestUnicodeDefines(void) {
    TEST_UNICODE_DEFINE(ULOC_KEYWORD_ASSIGN, ULOC_KEYWORD_ASSIGN_UNICODE);
    TEST_UNICODE_DEFINE(ULOC_KEYWORD_ITEM_SEPARATOR, ULOC_KEYWORD_ITEM_SEPARATOR_UNICODE);
  }
+
+/* Apple-specific, test for Apple-specific function ualoc_getAppleParent */
+static const char* localesAndAppleParent[] = {
+    "en",               "root",
+    "en-US",            "en",
+    "en-CA",            "en",
+    "en-001",           "en",
+    "en_001",           "en",
+    "en-GB",            "en_001",
+    "en_GB",            "en_001",
+    "en-IN",            "en_GB",
+    "en-AU",            "en_GB",
+    "es",               "root",
+    "es-ES",            "es",
+    "es-419",           "es",
+    "es_419",           "es",
+    "es-MX",            "es_419",
+    "es-AR",            "es_419",
+    "fr",               "root",
+    "fr-CA",            "fr",
+    "fr-CH",            "fr",
+    "haw",              "root",
+    "nl",               "root",
+    "nl-BE",            "nl",
+    "pt",               "root",
+    "pt-BR",            "pt",
+    "pt-PT",            "pt",
+    "pt-MO",            "pt_PT",
+    "sr",               "root",
+    "sr-Cyrl",          "sr",
+    "sr-Latn",          "root",
+    "tlh",              "root",
+    "zh_CN",            "root",
+    "zh-CN",            "root",
+    "zh",               "zh_CN",
+    "zh-Hans",          "zh",
+    "zh_TW",            "root",
+    "zh-TW",            "root",
+    "zh-Hant",          "zh_TW",
+    "zh_Hant",          "zh_TW",
+    "zh-Hant-HK",       "zh_Hant",
+    "zh_Hant_HK",       "zh_Hant",
+    "zh-Hant-MO",       "zh_Hant_HK",
+    "zh-Hans-HK",       "zh_Hans",
+    "root",             "root",
+    "en-Latn",          "en",
+    "en-Latn-US",       "en_Latn",
+    "en_US_POSIX",      "en_US",
+    "en_Latn_US_POSIX", "en_Latn_US",
+    "en-u-ca-hebrew",   "root",
+    "en@calendar=hebrew", "root",
+    "en_@calendar=hebrew", "root",
+    "en-",              "root",
+    "en_",              "root",
+    "Default@2x",       "root",
+    "default",          "root",
+    NULL /* terminator */
+};
+
+static void TestGetAppleParent() {
+    const char **localesPtr = localesAndAppleParent;
+    const char * locale;
+    while ((locale = *localesPtr++) != NULL) {
+        const char * expectParent = *localesPtr++;
+        UErrorCode status = U_ZERO_ERROR;
+        char getParent[ULOC_FULLNAME_CAPACITY];
+        int32_t plen = ualoc_getAppleParent(locale, getParent, ULOC_FULLNAME_CAPACITY, &status);
+        if (U_FAILURE(status)) {
+            log_err("FAIL: ualoc_getAppleParent input \"%s\", status %s\n", locale, u_errorName(status));
+        } else if (uprv_strcmp(expectParent, getParent) != 0) {
+            log_err("FAIL: ualoc_getAppleParent input \"%s\", expected parent \"%s\", got parent \"%s\"\n", locale, expectParent, getParent);
+        }
+    }
+}
diff --git a/icuSources/test/intltest/dcfmapts.cpp b/icuSources/test/intltest/dcfmapts.cpp

index 850aa97dde9eb07ed059eed3f9e0e750889ffae4..5eea88334559cd8340988f084f3f92d094dfa5b5 100644 (file)
--- a/icuSources/test/intltest/dcfmapts.cpp
+++ b/icuSources/test/intltest/dcfmapts.cpp
@@ -103,6 +103,14 @@ void IntlTestDecimalFormatAPI::testAPI(/*char *par*/)
          return;
      }
  
+    // bug 10864
+    status = U_ZERO_ERROR;
+    DecimalFormat noGrouping("###0.##", status);
+    if (noGrouping.getGroupingSize() != 0) {
+      errln("Grouping size should be 0 for no grouping.");
+    }
+    // end bug 10864
+
      status = U_ZERO_ERROR;
      const UnicodeString pattern("#,##0.# FF");
      DecimalFormat pat(pattern, status);
diff --git a/icuSources/test/intltest/measfmttest.cpp b/icuSources/test/intltest/measfmttest.cpp

index afdb3e325015b6f6092c72d749a7d953cf9a23cb..3665880bb0e49e55226e625774b4fb56ff33b3cb 100644 (file)
--- a/icuSources/test/intltest/measfmttest.cpp
+++ b/icuSources/test/intltest/measfmttest.cpp
@@ -375,10 +375,10 @@ void MeasureFormatTest::TestFormatPeriodEn() {
  
      ExpectedResult abbrevData[] = {
              {t_1m_59_9996s, LENGTHOF(t_1m_59_9996s), "1 min, 59.9996 secs"},
-            {t_19m, LENGTHOF(t_19m), "19 mins"},
+            {t_19m, LENGTHOF(t_19m), "19 min"},
              {t_1h_23_5s, LENGTHOF(t_1h_23_5s), "1 hr, 23.5 secs"},
-            {t_1h_23_5m, LENGTHOF(t_1h_23_5m), "1 hr, 23.5 mins"},
-            {t_1h_0m_23s, LENGTHOF(t_1h_0m_23s), "1 hr, 0 mins, 23 secs"},
+            {t_1h_23_5m, LENGTHOF(t_1h_23_5m), "1 hr, 23.5 min"},
+            {t_1h_0m_23s, LENGTHOF(t_1h_0m_23s), "1 hr, 0 min, 23 secs"},
              {t_2y_5M_3w_4d, LENGTHOF(t_2y_5M_3w_4d), "2 yrs, 5 mths, 3 wks, 4 days"}};
  
      ExpectedResult narrowData[] = {
diff --git a/icuSources/test/intltest/rbbitst.cpp b/icuSources/test/intltest/rbbitst.cpp

index 86b5b6ce15fc61df0411e65f7afe62a3e2ed654c..7abf93c4da4c9f450f4058925bdf2b98b8488942 100644 (file)
--- a/icuSources/test/intltest/rbbitst.cpp
+++ b/icuSources/test/intltest/rbbitst.cpp
@@ -1017,7 +1017,7 @@ void RBBITest::TestExtended() {
      tp.srcLine        = new UVector32(status);
      tp.srcCol         = new UVector32(status);
  
-    RegexMatcher      localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_]*) *>"), 0, status);
+    RegexMatcher      localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_@=-]*) *>"), 0, status);
      if (U_FAILURE(status)) {
          dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
      }
diff --git a/icuSources/test/intltest/tztest.cpp b/icuSources/test/intltest/tztest.cpp

index 715d1c1f9ba43cfb224eb0ec28626c46a0fa24bb..696957253fe8e658540220f7e2d7f005450731ab 100644 (file)
--- a/icuSources/test/intltest/tztest.cpp
+++ b/icuSources/test/intltest/tztest.cpp
@@ -1,6 +1,6 @@
  /***********************************************************************
   * COPYRIGHT: 
- * Copyright (c) 1997-2014, International Business Machines Corporation
+ * Copyright (c) 1997-2015, International Business Machines Corporation
   * and others. All Rights Reserved.
   ***********************************************************************/
  
@@ -2028,7 +2028,11 @@ void TimeZoneTest::TestCanonicalID() {
          {"America/Virgin", "America/Anguilla"},
          {"Antarctica/South_Pole", "Antarctica/McMurdo"},
          {"Arctic/Longyearbyen", "Europe/Oslo"},
+        {"Asia/Kuwait", "Asia/Aden"},
+        {"Asia/Muscat", "Asia/Dubai"},
          {"Asia/Phnom_Penh", "Asia/Bangkok"},
+        {"Asia/Qatar", "Asia/Bahrain"},
+        {"Asia/Riyadh", "Asia/Aden"},
          {"Asia/Vientiane", "Asia/Bangkok"},
          {"Atlantic/Jan_Mayen", "Europe/Oslo"},
          {"Atlantic/St_Helena", "Africa/Abidjan"},
diff --git a/icuSources/test/testdata/rbbitst.txt b/icuSources/test/testdata/rbbitst.txt

index 7e10d0c18eda428b9a8dad5e9419a73858d8d870..f5477d1fc59f550ced979fef9f8b6f21608e6e8b 100644 (file)
--- a/icuSources/test/testdata/rbbitst.txt
+++ b/icuSources/test/testdata/rbbitst.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2001-2014 International Business Machines
+# Copyright (c) 2001-2015 International Business Machines
  # Corporation and others. All Rights Reserved.
  #
  # RBBI Test Data
@@ -886,3 +886,110 @@ Bangkok)•</data>
  <data>•abc/\u05D9 •def•</data>
  <data>•\u05E7\u05D7/\u05D9 •\u05DE\u05E2\u05D9\u05DC•</data>
  <data>•\u05D3\u05E8\u05D5\u05E9\u05D9\u05DD •\u05E9\u05D7\u05E7\u05E0\u05D9\u05DD/\u05D9\u05D5\u05EA•</data>
+
+####################################################################################
+#
+#  Test CSS line break variants: strict, normal, loose
+#
+####################################################################################
+
+<locale ja@lb=strict>
+<line>
+#     •no brk before 3063              •no brk before 301C•no brk btw 2026   •no brk before FF01•
+<data>•\u3084\u3063•\u3071•\u308A\u0020•\u0031\u301C\u0020•\u2026\u2026\u0020•\u30A2\uFF01\u0020•</data>
+
+<locale ja@lb=normal>
+<line>
+#     •brk OK before 3063               •brk OK before 301C •no brk btw 2026   •no brk before FF01•
+<data>•\u3084•\u3063•\u3071•\u308A\u0020•\u0031•\u301C\u0020•\u2026\u2026\u0020•\u30A2\uFF01\u0020•</data>
+
+<locale ja@lb=loose>
+<line>
+#     •brk OK before 3063               •brk OK before 301C •brk OK btw 2026    •brk OK before FF01•
+<data>•\u3084•\u3063•\u3071•\u308A\u0020•\u0031•\u301C\u0020•\u2026•\u2026\u0020•\u30A2•\uFF01\u0020•</data>
+
+<locale en@lb=strict>
+<line>
+#     •no brk before 3063              •no brk before 301C•no brk btw 2026   •no brk before FF01•
+<data>•\u3084\u3063•\u3071•\u308A\u0020•\u0031\u301C\u0020•\u2026\u2026\u0020•\u30A2\uFF01\u0020•</data>
+
+<locale en@lb=normal>
+<line>
+#     •brk OK before 3063               •no brk before 301C •no brk btw 2026  •no brk before FF01•
+<data>•\u3084•\u3063•\u3071•\u308A\u0020•\u0031\u301C\u0020•\u2026\u2026\u0020•\u30A2\uFF01\u0020•</data>
+
+<locale en@lb=loose>
+<line>
+#     •brk OK before 3063               •no brk before 301C •brk OK btw 2026   •no brk before FF01•
+<data>•\u3084•\u3063•\u3071•\u308A\u0020•\u0031\u301C\u0020•\u2026•\u2026\u0020•\u30A2\uFF01\u0020•</data>
+
+####################################################################################
+#
+#  Test Apple breaks for emoji clusters (same for all locales and break types)
+#
+####################################################################################
+
+<locale root>
+
+<char>
+# woman zwj woman zwj girl zwj girl, woman/fitz-1-2 zwj woman/fitz-4 zwj boy/fitz-6
+<data>•\U0001F469\u200D\U0001F469\u200D\U0001F467\u200D\U0001F467•\U0001F469\U0001F3FB\u200D\U0001F469\U0001F3FD\u200D\U0001F466\U0001F3FF•</data>
+# woman zwj, baby/fitz-3, older_woman/fitz-5, runner/fitz-4, raised_fist/fitz-3, fuel_pump, fitz-3
+<data>•\U0001F469\u200D•\U0001F476\U0001F3FC•\U0001F475\U0001F3FE•\U0001F3C3\U0001F3FD•\u270A\U0001F3FC•\u26FD•\U0001F3FC•</data>
+# man zwj  hvy_blk_heart zwj  man, woman, man zwj hvy_blk_heart esel zwj  man, woman
+<data>•\U0001F468\u200D\u2764\u200D\U0001F468•\U0001F469•\U0001F468\u200D\u2764\uFE0F\u200D\U0001F468•\U0001F469•</data>
+# woman zwj hvy_blk_heart/esel zwj kiss_mark zwj woman, man
+<data>•\U0001F469\u200D\u2764\uFE0F\u200D\U0001F48B\u200D\U0001F469•\U0001F468•</data>
+# victory_hand esel, victory_hand/esel/fitz-1-2, victory_hand/fitz-1-2, rowboat/fitz-4, vulcan_salute/fitz-5, space, 
+<data>•\u270C\uFE0F•\u270C\uFE0F\U0001F3FB•\u270C\U0001F3FB•\U0001F6A3\U0001F3FD•\U0001F596\U0001F3FE•\u0020•</data>
+# flags1    AE                   AU                   AT                   BE                   BR                   CA                   CL
+<data>•\U0001F1E6\U0001F1EA•\U0001F1E6\U0001F1FA•\U0001F1E6\U0001F1F9•\U0001F1E7\U0001F1EA•\U0001F1E7\U0001F1F7•\U0001F1E8\U0001F1E6•\U0001F1E8\U0001F1F1•</data>
+# flags2    CN                   CO                   DK                   FI                   FR                   DE                   HK
+<data>•\U0001F1E8\U0001F1F3•\U0001F1E8\U0001F1F4•\U0001F1E9\U0001F1F0•\U0001F1EB\U0001F1EE•\U0001F1EB\U0001F1F7•\U0001F1E9\U0001F1EA•\U0001F1ED\U0001F1F0•</data>
+# flags3    IN                   ID                   IE                   IL                   IT                   JP                   KR
+<data>•\U0001F1EE\U0001F1F3•\U0001F1EE\U0001F1E9•\U0001F1EE\U0001F1EA•\U0001F1EE\U0001F1F1•\U0001F1EE\U0001F1F9•\U0001F1EF\U0001F1F5•\U0001F1F0\U0001F1F7•</data>
+
+<word>
+# woman zwj woman zwj girl zwj girl, woman/fitz-1-2 zwj woman/fitz-4 zwj boy/fitz-6
+<data>•\U0001F469\u200D\U0001F469\u200D\U0001F467\u200D\U0001F467•\U0001F469\U0001F3FB\u200D\U0001F469\U0001F3FD\u200D\U0001F466\U0001F3FF•</data>
+# woman zwj, baby/fitz-3, older_woman/fitz-5, runner/fitz-4, raised_fist/fitz-3, fuel_pump, fitz-3
+<data>•\U0001F469\u200D•\U0001F476\U0001F3FC•\U0001F475\U0001F3FE•\U0001F3C3\U0001F3FD•\u270A\U0001F3FC•\u26FD•\U0001F3FC•</data>
+# man zwj  hvy_blk_heart zwj  man, woman, man zwj hvy_blk_heart esel zwj  man, woman
+<data>•\U0001F468\u200D\u2764\u200D\U0001F468•\U0001F469•\U0001F468\u200D\u2764\uFE0F\u200D\U0001F468•\U0001F469•</data>
+# woman zwj hvy_blk_heart esel zwj kiss mark zwj woman, man
+<data>•\U0001F469\u200D\u2764\uFE0F\u200D\U0001F48B\u200D\U0001F469•\U0001F468•</data>
+# victory_hand esel, victory_hand/esel/fitz-1-2, victory_hand/fitz-1-2, rowboat/fitz-4, vulcan_salute/fitz-5, space, 
+<data>•\u270C\uFE0F•\u270C\uFE0F\U0001F3FB•\u270C\U0001F3FB•\U0001F6A3\U0001F3FD•\U0001F596\U0001F3FE•\u0020•</data>
+# flags1    AE                   AU                   AT                   BE                   BR                   CA                   CL
+<data>•\U0001F1E6\U0001F1EA•\U0001F1E6\U0001F1FA•\U0001F1E6\U0001F1F9•\U0001F1E7\U0001F1EA•\U0001F1E7\U0001F1F7•\U0001F1E8\U0001F1E6•\U0001F1E8\U0001F1F1•</data>
+# flags2    CN                   CO                   DK                   FI                   FR                   DE                   HK
+<data>•\U0001F1E8\U0001F1F3•\U0001F1E8\U0001F1F4•\U0001F1E9\U0001F1F0•\U0001F1EB\U0001F1EE•\U0001F1EB\U0001F1F7•\U0001F1E9\U0001F1EA•\U0001F1ED\U0001F1F0•</data>
+# flags3    IN                   ID                   IE                   IL                   IT                   JP                   KR
+<data>•\U0001F1EE\U0001F1F3•\U0001F1EE\U0001F1E9•\U0001F1EE\U0001F1EA•\U0001F1EE\U0001F1F1•\U0001F1EE\U0001F1F9•\U0001F1EF\U0001F1F5•\U0001F1F0\U0001F1F7•</data>
+
+<line>
+# woman zwj woman zwj girl zwj girl # (line, skip this for now, need safe rules and we don't generate it:) woman/fitz-1-2 zwj woman/fitz-4 zwj boy/fitz-6
+<data>•\U0001F469\u200D\U0001F469\u200D\U0001F467\u200D\U0001F467•</data>
+# woman zwj, baby/fitz-3, older_woman/fitz-5, runner/fitz-4, raised_fist/fitz-3, fuel_pump, fitz-3
+<data>•\U0001F469\u200D•\U0001F476\U0001F3FC•\U0001F475\U0001F3FE•\U0001F3C3\U0001F3FD•\u270A\U0001F3FC•\u26FD•\U0001F3FC•</data>
+# man zwj  hvy_blk_heart zwj  man, woman, man zwj hvy_blk_heart esel zwj  man, woman
+<data>•\U0001F468\u200D\u2764\u200D\U0001F468•\U0001F469•\U0001F468\u200D\u2764\uFE0F\u200D\U0001F468•\U0001F469•</data>
+# woman zwj hvy_blk_heart esel zwj kiss mark zwj woman, man
+<data>•\U0001F469\u200D\u2764\uFE0F\u200D\U0001F48B\u200D\U0001F469•\U0001F468•</data>
+# victory_hand esel, victory_hand/esel/fitz-1-2, victory_hand/fitz-1-2, rowboat/fitz-4, vulcan_salute/fitz-5 space, 
+<data>•\u270C\uFE0F•\u270C\uFE0F\U0001F3FB•\u270C\U0001F3FB•\U0001F6A3\U0001F3FD•\U0001F596\U0001F3FE\u0020•</data>
+# no special flags handling for line
+
+<locale ja@lb=loose>
+<line>
+# woman zwj woman zwj girl zwj girl # (line, skip this for now, need safe rules and we don't generate it:) woman/fitz-1-2 zwj woman/fitz-4 zwj boy/fitz-6
+<data>•\U0001F469\u200D\U0001F469\u200D\U0001F467\u200D\U0001F467•</data>
+# woman zwj, baby/fitz-3, older_woman/fitz-5, runner/fitz-4, raised_fist/fitz-3, fuel_pump, fitz-3
+<data>•\U0001F469\u200D•\U0001F476\U0001F3FC•\U0001F475\U0001F3FE•\U0001F3C3\U0001F3FD•\u270A\U0001F3FC•\u26FD•\U0001F3FC•</data>
+# man zwj  hvy_blk_heart zwj  man, woman, man zwj hvy_blk_heart esel zwj  man, woman
+<data>•\U0001F468\u200D\u2764\u200D\U0001F468•\U0001F469•\U0001F468\u200D\u2764\uFE0F\u200D\U0001F468•\U0001F469•</data>
+# woman zwj hvy_blk_heart esel zwj kiss mark zwj woman, man
+<data>•\U0001F469\u200D\u2764\uFE0F\u200D\U0001F48B\u200D\U0001F469•\U0001F468•</data>
+# victory_hand esel, victory_hand/esel/fitz-1-2, victory_hand/fitz-1-2, rowboat/fitz-4, vulcan_salute/fitz-5 space, 
+<data>•\u270C\uFE0F•\u270C\uFE0F\U0001F3FB•\u270C\U0001F3FB•\U0001F6A3\U0001F3FD•\U0001F596\U0001F3FE\u0020•</data>
+# no special flags handling for line
diff --git a/makefile b/makefile

index a2d181b3387b49dc0d778d78160f91adf2493d04..9f503e3e46ca50ff852435d62c557f01038e9ced 100644 (file)
--- a/makefile
+++ b/makefile
@@ -239,7 +239,9 @@ else
  endif
  
  # even for a crossbuild host build, we want to use the target's latest tzdata as pointed to by latest_tzdata.tar.gz
-export TZDATA:=$(SDKPATH)/usr/local/share/tz/$(shell readlink $(SDKPATH)/usr/local/share/tz/latest_tzdata.tar.gz)
+ifeq "$(shell test -d $(SDKPATH)/usr/local/share/tz && echo YES )" "YES"
+       export TZDATA:=$(SDKPATH)/usr/local/share/tz/$(shell readlink $(SDKPATH)/usr/local/share/tz/latest_tzdata.tar.gz)
+endif
  $(info # TZDATA=$(TZDATA))
  
  ifeq "$(WINDOWS)" "YES"
author	Apple <opensource@apple.com>
	Tue, 10 Mar 2015 17:57:33 +0000 (17:57 +0000)
committer	Apple <opensource@apple.com>
	Tue, 10 Mar 2015 17:57:33 +0000 (17:57 +0000)
icuSources/common/brkiter.cpp		patch \| blob \| blame \| history
icuSources/common/ualoc.cpp		patch \| blob \| blame \| history
icuSources/common/ubidi_props_data.h		patch \| blob \| blame \| history
icuSources/common/uchar_props_data.h		patch \| blob \| blame \| history
icuSources/data/brkitr/brkfiles.mk		patch \| blob \| blame \| history
icuSources/data/brkitr/char.txt		patch \| blob \| blame \| history
icuSources/data/brkitr/fi.txt		patch \| blob \| blame \| history
icuSources/data/brkitr/ja.txt		patch \| blob \| blame \| history
icuSources/data/brkitr/line.txt		patch \| blob \| blame \| history
icuSources/data/brkitr/line_fi.txt		patch \| blob \| blame \| history
icuSources/data/brkitr/line_ja.txt		patch \| blob \| blame \| history
icuSources/data/brkitr/line_loose.txt	[new file with mode: 0644]	patch \| blob
icuSources/data/brkitr/line_loose_cj.txt	[new file with mode: 0644]	patch \| blob
icuSources/data/brkitr/line_loose_fi.txt	[new file with mode: 0644]	patch \| blob
icuSources/data/brkitr/line_normal.txt	[new file with mode: 0644]	patch \| blob
icuSources/data/brkitr/line_normal_cj.txt	[new file with mode: 0644]	patch \| blob
icuSources/data/brkitr/line_normal_fi.txt	[new file with mode: 0644]	patch \| blob
icuSources/data/brkitr/root.txt		patch \| blob \| blame \| history
icuSources/data/brkitr/word.txt		patch \| blob \| blame \| history
icuSources/data/brkitr/word_POSIX.txt		patch \| blob \| blame \| history
icuSources/data/brkitr/zh.txt	[new file with mode: 0644]	patch \| blob
icuSources/data/brkitr/zh_Hant.txt	[new file with mode: 0644]	patch \| blob
icuSources/data/curr/supplementalData.txt		patch \| blob \| blame \| history
icuSources/data/in/ubidi.icu		patch \| blob \| blame \| history
icuSources/data/in/unames.icu		patch \| blob \| blame \| history
icuSources/data/in/uprops.icu		patch \| blob \| blame \| history
icuSources/data/locales/da.txt		patch \| blob \| blame \| history
icuSources/data/locales/en.txt		patch \| blob \| blame \| history
icuSources/data/locales/es.txt		patch \| blob \| blame \| history
icuSources/data/locales/es_419.txt		patch \| blob \| blame \| history
icuSources/data/locales/es_MX.txt		patch \| blob \| blame \| history
icuSources/data/locales/fr.txt		patch \| blob \| blame \| history
icuSources/data/locales/hu.txt		patch \| blob \| blame \| history
icuSources/data/locales/it.txt		patch \| blob \| blame \| history
icuSources/data/locales/it_CH.txt		patch \| blob \| blame \| history
icuSources/data/locales/ms.txt		patch \| blob \| blame \| history
icuSources/data/locales/pl.txt		patch \| blob \| blame \| history
icuSources/data/locales/sv.txt		patch \| blob \| blame \| history
icuSources/data/locales/tr.txt		patch \| blob \| blame \| history
icuSources/data/locales/zh_Hans_HK.txt		patch \| blob \| blame \| history
icuSources/data/locales/zh_Hans_MO.txt		patch \| blob \| blame \| history
icuSources/data/locales/zh_Hant.txt		patch \| blob \| blame \| history
icuSources/data/locales/zh_Hant_HK.txt		patch \| blob \| blame \| history
icuSources/data/locales/zh_Hant_MO.txt		patch \| blob \| blame \| history
icuSources/data/misc/metaZones.txt		patch \| blob \| blame \| history
icuSources/data/misc/windowsZones.txt		patch \| blob \| blame \| history
icuSources/data/misc/zoneinfo64.txt		patch \| blob \| blame \| history
icuSources/data/unidata/ApplePUApatch.txt		patch \| blob \| blame \| history
icuSources/data/unidata/DerivedCoreProperties.txt		patch \| blob \| blame \| history
icuSources/data/unidata/UnicodeData.txt		patch \| blob \| blame \| history
icuSources/data/unidata/ppucd.txt		patch \| blob \| blame \| history
icuSources/i18n/decimfmt.cpp		patch \| blob \| blame \| history
icuSources/i18n/smpdtfmt.cpp		patch \| blob \| blame \| history
icuSources/test/cintltst/cloctst.c		patch \| blob \| blame \| history
icuSources/test/intltest/dcfmapts.cpp		patch \| blob \| blame \| history
icuSources/test/intltest/measfmttest.cpp		patch \| blob \| blame \| history
icuSources/test/intltest/rbbitst.cpp		patch \| blob \| blame \| history
icuSources/test/intltest/tztest.cpp		patch \| blob \| blame \| history
icuSources/test/testdata/rbbitst.txt		patch \| blob \| blame \| history
makefile		patch \| blob \| blame \| history