}
// -------------------------------------
+enum { kLBTypeLenMax = 32 };
BreakIterator*
BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
if (U_FAILURE(status)) {
return NULL;
}
+ char lbType[kLBTypeLenMax];
BreakIterator *result = NULL;
switch (kind) {
result = BreakIterator::buildInstance(loc, "word", kind, status);
break;
case UBRK_LINE:
- result = BreakIterator::buildInstance(loc, "line", kind, status);
+ uprv_strcpy(lbType, "line");
+ {
+ char lbKeyValue[kLBTypeLenMax];
+ UErrorCode kvStatus = U_ZERO_ERROR;
+ loc.getKeywordValue("lb", lbKeyValue, kLBTypeLenMax, kvStatus);
+ if (U_SUCCESS(kvStatus) && (uprv_strcmp(lbKeyValue,"strict")==0 || uprv_strcmp(lbKeyValue,"normal")==0 || uprv_strcmp(lbKeyValue,"loose")==0)) {
+ uprv_strcat(lbType, "_");
+ uprv_strcat(lbType, lbKeyValue);
+ }
+ }
+ result = BreakIterator::buildInstance(loc, lbType, kind, status);
break;
case UBRK_SENTENCE:
result = BreakIterator::buildInstance(loc, "sentence", kind, status);
int32_t len;
UErrorCode tempStatus;
char locbuf[ULOC_FULLNAME_CAPACITY+1];
+ char * foundDoubleUnderscore;
if (U_FAILURE(*err)) {
return 0;
*err = U_ILLEGAL_ARGUMENT_ERROR;
return 0;
}
- len = uloc_canonicalize(localeID, locbuf, ULOC_FULLNAME_CAPACITY, err);
+ len = uloc_getBaseName(localeID, locbuf, ULOC_FULLNAME_CAPACITY, err); /* canonicalize and strip keywords */
if (U_FAILURE(*err)) {
return 0;
}
locbuf[ULOC_FULLNAME_CAPACITY] = 0;
*err = U_ZERO_ERROR;
}
+ foundDoubleUnderscore = uprv_strstr(locbuf, "__"); /* __ comes from bad/missing subtag or variant */
+ if (foundDoubleUnderscore != NULL) {
+ *foundDoubleUnderscore = 0; /* terminate at the __ */
+ len = uprv_strlen(locbuf);
+ }
if (len >= 2 && uprv_strncmp(locbuf, "zh", 2) == 0) {
const char ** forceParentPtr = forceParent;
const char * testCurLoc;
#if !U_PLATFORM_IS_DARWIN_BASED
static const int32_t ubidi_props_indexes[UBIDI_IX_TOP]={0x10,0x53f0,0x50b8,0x1a,0x620,0x8b0,0,0,0,0,0,0,0,0,0,0x3902b6};
#else /* !U_PLATFORM_IS_DARWIN_BASED */
-static const int32_t ubidi_props_indexes[UBIDI_IX_TOP]={0x10,0x54f0,0x51b8,0x1a,0x620,0x8b0,0,0,0,0,0,0,0,0,0,0x3902b6};
+static const int32_t ubidi_props_indexes[UBIDI_IX_TOP]={0x10,0x54e8,0x51b0,0x1a,0x620,0x8b0,0,0,0,0,0,0,0,0,0,0x3902b6};
#endif /* !U_PLATFORM_IS_DARWIN_BASED */
#if !U_PLATFORM_IS_DARWIN_BASED
static const uint16_t ubidi_props_trieIndex[10324]={
#else /* !U_PLATFORM_IS_DARWIN_BASED */
-static const uint16_t ubidi_props_trieIndex[10452]={
+static const uint16_t ubidi_props_trieIndex[10448]={
#endif /* !U_PLATFORM_IS_DARWIN_BASED */
0x320,0x328,0x330,0x338,0x350,0x358,0x360,0x368,0x340,0x348,0x340,0x348,0x340,0x348,0x340,0x348,
0x340,0x348,0x340,0x348,0x36e,0x376,0x37e,0x386,0x38e,0x396,0x392,0x39a,0x3a2,0x3aa,0x3a5,0x3ad,
0x3d5,0x3d5,0x3d5,0x43f,0x43f,0x43f,0x43f,0x43f,0x43f,0x43f,0x9a7,0x3d5,0x3d5,0x3d5,0x3d5,0x3d5,
0x3d5,0x3d5,0x3d5,0x5ee,0x7b9,0x5ee,0x5ee,0x5f1,0x9b7,0x9bf,0x340,0x9af,0x340,0x340,0x9c7,0x340,
0x340,0x340,0x340,0x340,0x340,0x340,0x340,0x340,0x340,0x340,0x340,0x5ee,0x9cf,0x5ee,0x9d5,0x5f1,
-0x5ee,0x9dd,0x9e5,0x5ee,0x9ed,0x9f5,0x5ee,0x5ee,0x5ee,0x5ee,0x9f7,0x5ee,0x9ff,0xa07,0x7f0,0x340,
-0x340,0x340,0x6fb,0x5ee,0x5ee,0xa0f,0x340,0x5ee,0x5ee,0x6f9,0x340,0x5ee,0x5ee,0x5ee,0x5f1,0x340,
+0x5ee,0x9dd,0x9e5,0x5ee,0x9ec,0x9f4,0x5ee,0x5ee,0x5ee,0x5ee,0x9f6,0x5ee,0x9fe,0xa06,0x7f0,0x340,
+0x340,0x340,0x6fb,0x5ee,0x5ee,0xa0e,0x340,0x5ee,0x5ee,0x6f9,0x340,0x5ee,0x5ee,0x5ee,0x5f1,0x340,
#endif /* !U_PLATFORM_IS_DARWIN_BASED */
0x340,0x340,0x340,0x340,0x340,0x340,0x340,0x340,0x340,0x340,0x340,0x340,0x340,0x340,0x340,0x340,
0x340,0x340,0x340,0x340,0x340,0x340,0x340,0x340,0x340,0x340,0x340,0x340,0x340,0x340,0x340,0x340,
0xa04,0xa04,0xa04,0xa04,0xa04,0xa04,0xa04,0xa04,0xa04,0xa04,0xa04,0xa04,0xa04,0xa04,0xa04,0xa04,
0xa04,0xa04,0xa04,0xa04,0xa04,0xa04,0xa04,0xa04,0xa04,0xa04,0xa04,0xa04,0xa04,0xa04,0xa04,0x31f,
#else /* !U_PLATFORM_IS_DARWIN_BASED */
-0x340,0x340,0x340,0x340,0x340,0x340,0x340,0x340,0x340,0x340,0x340,0x340,0x340,0x340,0xa13,0xa23,
-0xa1b,0xa1b,0xa1b,0xa24,0xa24,0xa24,0xa24,0x38e,0x38e,0x38e,0x38e,0x38e,0x38e,0x38e,0xa2c,0xa24,
-0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,
-0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,
-0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,
-0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0xa24,0x31f,
+0x340,0x340,0x340,0x340,0x340,0x340,0x340,0x340,0x340,0x340,0x340,0x340,0x340,0x340,0xa12,0xa22,
+0xa1a,0xa1a,0xa1a,0xa23,0xa23,0xa23,0xa23,0x38e,0x38e,0x38e,0x38e,0x38e,0x38e,0x38e,0xa2b,0xa23,
+0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,
+0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,
+0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,
+0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0xa23,0x31f,
#endif /* !U_PLATFORM_IS_DARWIN_BASED */
0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,8,7,8,9,7,0x12,0x12,
0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,7,7,7,8,
0xa,0,0,0,0xa,0xa,0xa,0xa,0xa,0,0xa,0xa,0xa,0xa,0xa,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,
+#if !U_PLATFORM_IS_DARWIN_BASED
0xa,0xa,0xa,0xa,0xa,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,
+#else /* !U_PLATFORM_IS_DARWIN_BASED */
+0xa,0xa,0xa,0xa,0xa,0,0,0,0,0,0,0,0,0,0,0xa,
+#endif /* !U_PLATFORM_IS_DARWIN_BASED */
0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,
+#if !U_PLATFORM_IS_DARWIN_BASED
0xa,0xa,0xa,0,0xa,0,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,
+#else /* !U_PLATFORM_IS_DARWIN_BASED */
+0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0,
+0xa,0,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,
+#endif /* !U_PLATFORM_IS_DARWIN_BASED */
0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,
+#if !U_PLATFORM_IS_DARWIN_BASED
0xa,0xa,0xa,0xa,0,0xa,0xa,0xa,0xa,0,0,0,0xa,0xa,0xa,0xa,
+#else /* !U_PLATFORM_IS_DARWIN_BASED */
+0,0xa,0xa,0xa,0xa,0,0,0,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,
+#endif /* !U_PLATFORM_IS_DARWIN_BASED */
0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,
+#if !U_PLATFORM_IS_DARWIN_BASED
0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0,0,0xa,0xa,0xa,0xa,
0,0,0,0,0,0,0,0,0,0,0,0,0xa,0xa,0xa,0xa,
0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0,0,0,
0,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0,0,0,0,
+#else /* !U_PLATFORM_IS_DARWIN_BASED */
+0xa,0xa,0xa,0xa,0xa,0xa,0,0,0xa,0xa,0xa,0xa,0,0,0,0,
+0,0,0,0,0,0,0,0,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,
+0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0,0,0,0,0xa,0xa,0xa,
+0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0,0,0,0,0,0,0,0,
+#endif /* !U_PLATFORM_IS_DARWIN_BASED */
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+#if !U_PLATFORM_IS_DARWIN_BASED
0,0,0,0,0,0,0,0,0,0,0x12,0x12,0xb2,0xb2,0xb2,0xb2,
+#else /* !U_PLATFORM_IS_DARWIN_BASED */
+0,0,0,0,0,0,0x12,0x12,0xb2,0xb2,0xb2,0xb2,0xb2,0xb2,0xb2,0xb2,
+#endif /* !U_PLATFORM_IS_DARWIN_BASED */
0xb2,0xb2,0xb2,0xb2,0xb2,0xb2,0xb2,0xb2,0xb2,0xb2,0xb2,0xb2,0xb2,0xb2,0xb2,0xb2,
+#if !U_PLATFORM_IS_DARWIN_BASED
0xb2,0xb2,0xb2,0xb2,0xb2,0xb2,0xb2,0xb2,0xb2,0xb2,0xb2,0xb2,0x12,0xb2,0x12,0x12,
0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,
0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,
0xb1,0xb1,0xb1,0xb1,0xb1,0xb1,0xb1,0xb1,0xb1,0xb1,0xb1,0xb1,0xb1,0xb1,0xb1,0xb1,
+#else /* !U_PLATFORM_IS_DARWIN_BASED */
+0xb2,0xb2,0xb2,0xb2,0xb2,0xb2,0xb2,0xb2,0x12,0xb2,0x12,0x12,0x12,0x12,0x12,0x12,
+#endif /* !U_PLATFORM_IS_DARWIN_BASED */
0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,
+#if !U_PLATFORM_IS_DARWIN_BASED
0,0,0,0
+#else /* !U_PLATFORM_IS_DARWIN_BASED */
+0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0xb1,0xb1,0xb1,0xb1,
+0xb1,0xb1,0xb1,0xb1,0xb1,0xb1,0xb1,0xb1,0xb1,0xb1,0xb1,0xb1,0x12,0x12,0x12,0x12,
+0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0,0,0,0
+#endif /* !U_PLATFORM_IS_DARWIN_BASED */
};
static const uint32_t ubidi_props_mirrors[26]={
#if !U_PLATFORM_IS_DARWIN_BASED
7124,
#else /* !U_PLATFORM_IS_DARWIN_BASED */
- 7252,
+ 7248,
#endif /* !U_PLATFORM_IS_DARWIN_BASED */
0x1a0,
0xd00,
#if !U_PLATFORM_IS_DARWIN_BASED
0x2850,
#else /* !U_PLATFORM_IS_DARWIN_BASED */
- 0x28d0,
+ 0x28cc,
#endif /* !U_PLATFORM_IS_DARWIN_BASED */
NULL, 0, FALSE, FALSE, 0, NULL
},
0x1b,0,0x1b,0x1b,0x1b,0x1b,0x1b,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0x1b,0x1b,0x1b,0x1b,
0x1b,0x1b,0x1b,0x1b,0x1b,0x1b,0x1b,0x1b,0x1b,0x1b,0x1b,0x1b,0x1b,0,0,0,
-0,0,0,0,0,0,0,0,0,0,0,0,0x1b,0,0x1b,0x1b,
+0,0,0,0,0,0,0,0x1a,0x1a,0x1a,0x1a,0x1a,0x1b,0,0x1b,0x1b,
#endif /* !U_PLATFORM_IS_DARWIN_BASED */
0x1b,0x1b,0x1b,0x1b,0x1b,0x1b,0x1b,0x1b,0x1b,0x1b,0x1b,0x1b,0x1b,0x1b,0x1b,0x1b,
#if !U_PLATFORM_IS_DARWIN_BASED
0x1266,0x1266,0x1266,0x1b3,0x1b3,0x1b3,0x1b3,0x1b3,0x1b3,0x1b3,0x1b3,0x1b3,0x1b3,0x1b3,0x1b3,0x1b3,
0x1b3,0x1b3,0x1b3,0x1b3,0x1b3,0x1b3,0x1b3,0x1b3,0x1266,0x1266,0x1266,0x1266,0x1266,0x1266,0x1266,0x1266,
0x1266,0x1266,0x1266,0x1266,0x1266,0x1266,0x1266,0x1266,0x1266,0x1b3,0x1b3,0x1b3,0x1b3,0x1b3,0x1b3,0x1b3,
-0x1b3,0x1b3,0x1b3,0x1b3,0x1b3,0x1b3,0x1b3,0x1b3,0x1266,0x1266,0x1266,0x1266,0x1266,0x1266,0x1266,0x1266,
+0x1b3,0x1b3,0x1b3,0x136b,0x136b,0x136b,0x136b,0x136b,0x1266,0x1266,0x1266,0x1266,0x1266,0x1266,0x1266,0x1266,
0x1266,0x1266,0x1266,0x1266,0x1266,0x1266,0x1266,0x1266,0x1266,0x1266,0x1266,0x1266,0x1266,0x1266,0x1266,0x1266,
0x1266,0x1266,0x1266,0x1266,0x1266,0x1266,0x1266,0x1b3,0x1266,0x1b3,0x1266,0x1266,0x1266,0x1266,0x1266,0x1266,
0x1266,0x1266,0x1266,0x1266,0x1266,0x1266,0x1266,0x1266,0x1266,0x1266,0x1266,0x1266,0x1266,0x1266,0x1266,0x1266,
#if !U_PLATFORM_IS_DARWIN_BASED
static const uint32_t propsVectors[4917]={
#else /* !U_PLATFORM_IS_DARWIN_BASED */
-static const uint32_t propsVectors[4971]={
+static const uint32_t propsVectors[4974]={
#endif /* !U_PLATFORM_IS_DARWIN_BASED */
0x67,0,0,0x67,0x80000,0x20,0x867,0,0,0xa67,0,0,0xb67,0,0,0xc67,
0,0,0xd67,0,0,0xe67,0,0,0xf67,0,0,0x1067,0,0,0x1167,0,
0x6804400,0x962540,0x6100d997,0x7c00100,0x230400,0x6100d997,0xc000010,0x448000,0x6100da98,0x6800000,0x1329800,0x6100da98,0x7c00100,0x230400,0x6100db71,0x4000000,
0x200000,0x6100dc99,0x2802100,0x962460,0x6100dc99,0x2802400,0x962460,0x6100dc99,0x6800000,0x1329800,0x6100dc99,0x6800100,0x962540,0x6100dc99,0x6804400,0x962540,
0x6100dc99,0x7c00100,0x230400,0x610a4711,0x7c40300,0xe30000,0x610a4f11,0x7c00300,0xe30001,0x6140af2d,0x6800100,0x962540,0x6180af2d,0x2802400,0x962460,0x62002a00,
-0x4000000,0x1600000,0x63002800,0x80000,0x918820,0x63c00c09,0x80000,0x918820,0x70002a00,0x4000000,0x1600000};
+0x4000000,0x1600000,0x63002800,0x80000,0x918820,0x63c00c09,0x80000,0x918820,0x70002a00,0x4000000,0x1600000,0x8000cd00,0x4000000,0xe00000};
#endif /* !U_PLATFORM_IS_DARWIN_BASED */
#if !U_PLATFORM_IS_DARWIN_BASED
static const int32_t countPropsVectors=4917;
#else /* !U_PLATFORM_IS_DARWIN_BASED */
-static const int32_t countPropsVectors=4971;
+static const int32_t countPropsVectors=4974;
#endif /* !U_PLATFORM_IS_DARWIN_BASED */
static const int32_t propsVectorsColumns=3;
static const uint16_t scriptExtensions[74]={
#if !U_PLATFORM_IS_DARWIN_BASED
static const int32_t indexes[UPROPS_INDEX_COUNT]={0x231a,0x231a,0x231a,0x231a,0x50da,3,0x640f,0x6434,0x6434,0x6434,0xadca0,0x2774191,0,0,0,0};
#else /* !U_PLATFORM_IS_DARWIN_BASED */
-static const int32_t indexes[UPROPS_INDEX_COUNT]={0x235c,0x235c,0x235c,0x235c,0x517e,3,0x64e9,0x650e,0x650e,0x650e,0xadca0,0x2774191,0,0,0,0};
+static const int32_t indexes[UPROPS_INDEX_COUNT]={0x235c,0x235c,0x235c,0x235c,0x517e,3,0x64ec,0x6511,0x6511,0x6511,0xadca0,0x2774191,0,0,0,0};
#endif /* !U_PLATFORM_IS_DARWIN_BASED */
# List of break iterator files (brk).
-BRK_SOURCE = char.txt line.txt line_fi.txt line_ja.txt\
+BRK_SOURCE = char.txt line.txt line_fi.txt\
+ line_loose.txt line_loose_cj.txt line_loose_fi.txt\
+ line_normal.txt line_normal_cj.txt line_normal_fi.txt\
sent.txt sent_el.txt title.txt word.txt word_POSIX.txt
# Ordinary resources
BRK_RES_SOURCE = de.txt el.txt en.txt en_US.txt\
en_US_POSIX.txt es.txt fi.txt fr.txt it.txt\
- ja.txt pt.txt ru.txt
+ ja.txt pt.txt ru.txt zh.txt zh_Hant.txt
#
-# Copyright (C) 2002-2013, International Business Machines Corporation and others.
+# Copyright (C) 2002-2015, International Business Machines Corporation and others.
# All Rights Reserved.
#
# file: char.txt
$Extend = [\p{Grapheme_Cluster_Break = Extend}];
$SpacingMark = [\p{Grapheme_Cluster_Break = SpacingMark}];
-$RI_A = \U0001F1E6; # Trail ERTU
-$RI_B = \U0001F1E7; # Trail EGR
-$RI_C = \U0001F1E8; # Trail AHLNZ
+$RI_A = \U0001F1E6; # Trail ETU
+$RI_B = \U0001F1E7; # Trail ER
+$RI_C = \U0001F1E8; # Trail AHLNO
$RI_D = \U0001F1E9; # Trail EK
-$RI_E = \U0001F1EA; # Trail GS
+$RI_E = \U0001F1EA; # Trail S
$RI_F = \U0001F1EB; # Trail IR
-$RI_G = \U0001F1EC; # Trail BR
-$RI_H = \U0001F1ED; # Trail KU
-$RI_I = \U0001F1EE; # Trail DLNT
+$RI_G = \U0001F1EC; # Trail B
+$RI_H = \U0001F1ED; # Trail K
+$RI_I = \U0001F1EE; # Trail DELNT
$RI_J = \U0001F1EF; # Trail OP
$RI_K = \U0001F1F0; # Trail R
-$RI_L = \U0001F1F1; # Trail B
$RI_M = \U0001F1F2; # Trail OXY
-$RI_N = \U0001F1F3; # Trail LO
-$RI_P = \U0001F1F5; # Trail LT
-$RI_R = \U0001F1F7; # Trail OU
-$RI_S = \U0001F1F8; # Trail AEGK
-$RI_T = \U0001F1F9; # Trail HRW
-$RI_U = \U0001F1FA; # Trail AS
+$RI_N = \U0001F1F3; # Trail LOZ
+$RI_P = \U0001F1F5; # Trail HLRT
+$RI_R = \U0001F1F7; # Trail U
+$RI_S = \U0001F1F8; # Trail AEG
+$RI_T = \U0001F1F9; # Trail R
+$RI_U = \U0001F1FA; # Trail S
$RI_V = \U0001F1FB; # Trail N
+$RI_Z = \U0001F1FF; # Trail A
+# unused trail values, safe as addl lead: C F J M Q V W
+# unused lead values, safe as addl trail: L O Q W X Y
-$RI_A_End = [\U0001F1EA \U0001F1F7 \U0001F1F9 \U0001F1FA]; # ERTU
-$RI_B_End = [\U0001F1EA \U0001F1EC \U0001F1F7]; # EGR
-$RI_C_End = [\U0001F1E6 \U0001F1ED \U0001F1F1 \U0001F1F3 \U0001F1FF]; # AHLNZ
+$RI_A_End = [\U0001F1EA \U0001F1F9 \U0001F1FA]; # ETU
+$RI_B_End = [\U0001F1EA \U0001F1F7]; # ER
+$RI_C_End = [\U0001F1E6 \U0001F1ED \U0001F1F1 \U0001F1F3 \U0001F1F4]; # AHLNO
$RI_D_End = [\U0001F1EA \U0001F1F0]; # EK
-$RI_E_End = [\U0001F1EC \U0001F1F8]; # GS
+$RI_E_End = \U0001F1F8; # S
$RI_F_End = [\U0001F1EE \U0001F1F7]; # IR
-$RI_G_End = [\U0001F1E7 \U0001F1F7]; # BR
-$RI_H_End = [\U0001F1F0 \U0001F1FA]; # KU
-$RI_I_End = [\U0001F1E9 \U0001F1F1 \U0001F1F3 \U0001F1F9]; # DLNT
-$RI_J_End = [\U0001F1F4 \U0001F1F5]; # OP
+$RI_G_End = \U0001F1E7; # B
+$RI_H_End = \U0001F1F0; # K
+$RI_I_End = [\U0001F1E9 \U0001F1EA \U0001F1F1 \U0001F1F3 \U0001F1F9]; # DELNT
+$RI_J_End = [\U0001F1F5 \U0001F1F4]; # OP
$RI_K_End = \U0001F1F7; # R
-$RI_L_End = \U0001F1E7; # B
$RI_M_End = [\U0001F1F4 \U0001F1FD \U0001F1FE]; # OXY
-$RI_N_End = [\U0001F1F1 \U0001F1F4]; # LO
-$RI_P_End = [\U0001F1F1 \U0001F1F9]; # LT
-$RI_R_End = [\U0001F1F4 \U0001F1FA]; # OU
-$RI_S_End = [\U0001F1E6 \U0001F1EA \U0001F1EC \U0001F1F0]; # AEGK
-$RI_T_End = [\U0001F1ED \U0001F1F7 \U0001F1FC]; # HRW
-$RI_U_End = [\U0001F1E6 \U0001F1F8]; # AS
+$RI_N_End = [\U0001F1F1 \U0001F1F4 \U0001F1FF]; # LOZ
+$RI_P_End = [\U0001F1ED \U0001F1F1 \U0001F1F7 \U0001F1F9]; # HLRT
+$RI_R_End = \U0001F1FA; # U
+$RI_S_End = [\U0001F1E6 \U0001F1EA \U0001F1EC]; # AEG
+$RI_T_End = \U0001F1F7; # R
+$RI_U_End = \U0001F1F8; # S
$RI_V_End = \U0001F1F3; # N
+$RI_Z_End = \U0001F1E6; # A
+
+# Special character classes for people & body part emoji:
+# Subsets of $Extend:
+$ZWJ = \u200D;
+$EmojiVar = [\uFE0F];
+# The following are subsets of \p{Grapheme_Cluster_Break = Other} which is not otherwise used here
+$EmojiForSeqs = [\u2764 \U0001F466-\U0001F469 \U0001F48B];
+$EmojiForMods = [\u261D \u270A-\u270C \U0001F385 \U0001F3C3-\U0001F3C4 \U0001F3C7 \U0001F3CA \U0001F442-\U0001F443 \U0001F446-\U0001F450 \U0001F466-\U0001F469 \U0001F46E-\U0001F478 \U0001F47C \U0001F481-\U0001F483 \U0001F485-\U0001F487 \U0001F4AA \U0001F596 \U0001F645-\U0001F647 \U0001F64B-\U0001F64F \U0001F6A3 \U0001F6B4-\U0001F6B6 \U0001F6C0];
+$EmojiMods = [\U0001F3FB-\U0001F3FF];
#
# Korean Syllable Definitions
$RI_I $RI_I_End;
$RI_J $RI_J_End;
$RI_K $RI_K_End;
-$RI_L $RI_L_End;
$RI_M $RI_M_End;
$RI_N $RI_N_End;
$RI_P $RI_P_End;
$RI_T $RI_T_End;
$RI_U $RI_U_End;
$RI_V $RI_V_End;
+$RI_Z $RI_Z_End;
[^$Control $CR $LF] $Extend;
[^$Control $CR $LF] $SpacingMark;
# TODO: Restore if the Prepend set becomes non-empty again: $Prepend [^$Control $CR $LF];
+# Special forward rules for people & body part emoji:
+# don't break $ZWJ from subsequent $EmojiForSeqs; don't break between relevant emoji and $EmojiMods
+$ZWJ $EmojiForSeqs;
+$EmojiForMods $EmojiVar? $EmojiMods;
+
## -------------------------------------------------
$RI_I_End $RI_I;
$RI_J_End $RI_J;
$RI_K_End $RI_K;
-$RI_L_End $RI_L;
$RI_M_End $RI_M;
$RI_N_End $RI_N;
$RI_P_End $RI_P;
$RI_T_End $RI_T;
$RI_U_End $RI_U;
$RI_V_End $RI_V;
+$RI_Z_End $RI_Z;
$Extend [^$Control $CR $LF];
$SpacingMark [^$Control $CR $LF];
# TODO: Restore if the Prepend set becomes non-empty again: [^$Control $CR $LF] $Prepend;
+# Special reverse rules for people & body part emoji:
+# don't break $ZWJ from subsequent $EmojiForSeqs; don't break between relevant emoji and $EmojiMods
+$EmojiForSeqs $ZWJ;
+$EmojiMods $EmojiVar? $EmojiForMods;
## -------------------------------------------------
# We don't logically need safe char break rules, but if we don't provide any at all
!!safe_reverse;
$LF $CR;
+[$EmojiVar $EmojiMods]+ $EmojiForMods;
## -------------------------------------------------
!!safe_forward;
$CR $LF;
+$EmojiForMods [$EmojiVar $EmojiMods]+;
Version{"2.0.82.42"}
boundaries{
line:process(dependency){"line_fi.brk"}
+ line_loose:process(dependency){"line_loose_fi.brk"}
+ line_normal:process(dependency){"line_normal_fi.brk"}
+ line_strict:process(dependency){"line_fi.brk"}
}
}
ja{
Version{"2.0.82.42"}
boundaries{
- line:process(dependency){"line_ja.brk"}
+ line:process(dependency){"line_normal.brk"}
+ line_loose:process(dependency){"line_loose_cj.brk"}
+ line_normal:process(dependency){"line_normal_cj.brk"}
+ line_strict:process(dependency){"line.brk"}
}
}
-# Copyright (c) 2002-2013 International Business Machines Corporation and
+# Copyright (c) 2002-2015 International Business Machines Corporation and
# others. All Rights Reserved.
#
# file: line.txt
# TODO: Rule LB 8 remains as it was in Unicode 5.2
# This is only because of a limitation of ICU break engine implementation,
# not because the older behavior is desirable.
+#
+# This corresponds to CSS line-break=strict (BCP47 -u-lb-strict).
+# It sets characters of class CJ to behave like NS.
#
# Character Classes defined by TR 14.
$XX = [:LineBreak = Unknown:];
$ZW = [:LineBreak = ZWSpace:];
+# Special character classes for people & body part emoji:
+# Subsets of $CM:
+$ZWJ = \u200D;
+$EmojiVar = \uFE0F;
+# The following are subsets of $ID
+$EmojiForSeqs = [\u2764 \U0001F466-\U0001F469 \U0001F48B];
+$EmojiForMods = [\u261D \u270A-\u270C \U0001F385 \U0001F3C3-\U0001F3C4 \U0001F3C7 \U0001F3CA \U0001F442-\U0001F443 \U0001F446-\U0001F450 \U0001F466-\U0001F469 \U0001F46E-\U0001F478 \U0001F47C \U0001F481-\U0001F483 \U0001F485-\U0001F487 \U0001F4AA \U0001F596 \U0001F645-\U0001F647 \U0001F64B-\U0001F64F \U0001F6A3 \U0001F6B4-\U0001F6B6 \U0001F6C0];
+$EmojiMods = [\U0001F3FB-\U0001F3FF];
+
# Dictionary character set, for triggering language-based break engines. Currently
# limited to LineBreak=Complex_Context. Note that this set only works in Unicode
# 5.0 or later as the definition of Complex_Context was corrected to include all
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
+# Special forward rule for people & body part emoji:
+# don't break $ZWJ from subsequent $EmojiForSeqs
+$EmojiForSeqs $EmojiVar? $EmojiMods? $ZWJ $EmojiForSeqs;
+
# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
# $CM not covered by the above needs to behave like $AL
# See definition of $CAN_CM.
# LB 30a Do not break between regional indicators.
$RIcm $RIcm;
+# Special forward rule for people & body part emoji:
+# don't break between relevant emoji and $EmojiMods
+$EmojiForMods $EmojiVar? $EmojiMods;
+
#
# Reverse Rules.
#
# Requires an engine enhancement.
# / $SP* $ZW
+# Special reverse rule for people & body part emoji:
+# don't break $ZWJ from subsequent $EmojiForSeqs
+$EmojiForSeqs $ZWJ $EmojiMods? $EmojiVar? $EmojiForSeqs;
+
# LB 9,10 Combining marks.
# X $CM needs to behave like X, where X is not $SP or controls.
# $CM not covered by the above needs to behave like $AL
# LB 30a
$CM* $RI $CM* $RI;
+# Special reverse rule for people & body part emoji:
+# don't break between relevant emoji and $EmojiMods
+$EmojiMods $EmojiVar? $EmojiForMods;
+
## -------------------------------------------------
!!safe_reverse;
-# Copyright (c) 2002-2013 International Business Machines Corporation and
+# Copyright (c) 2002-2015 International Business Machines Corporation and
# others. All Rights Reserved.
#
# file: line_fi.txt
# Implement default line breaking as defined by
# Unicode Standard Annex #14 Revision 29 for Unicode 6.2
# http://www.unicode.org/reports/tr14/
+# tailored as noted in the second paragraph below.
#
# TODO: Rule LB 8 remains as it was in Unicode 5.2
# This is only because of a limitation of ICU break engine implementation,
# not because the older behavior is desirable.
+#
+# This tailors the line break behavior for Finnish, while otherwise behaving
+# per UAX 14 which corresponds to CSS line-break=strict (BCP47 -u-lb-strict).
+# It sets characters of class CJ to behave like NS.
#
# Character Classes defined by TR 14.
$XX = [:LineBreak = Unknown:];
$ZW = [:LineBreak = ZWSpace:];
+# Special character classes for people & body part emoji:
+# Subsets of $CM:
+$ZWJ = \u200D;
+$EmojiVar = \uFE0F;
+# The following are subsets of $ID
+$EmojiForSeqs = [\u2764 \U0001F466-\U0001F469 \U0001F48B];
+$EmojiForMods = [\u261D \u270A-\u270C \U0001F385 \U0001F3C3-\U0001F3C4 \U0001F3C7 \U0001F3CA \U0001F442-\U0001F443 \U0001F446-\U0001F450 \U0001F466-\U0001F469 \U0001F46E-\U0001F478 \U0001F47C \U0001F481-\U0001F483 \U0001F485-\U0001F487 \U0001F4AA \U0001F596 \U0001F645-\U0001F647 \U0001F64B-\U0001F64F \U0001F6A3 \U0001F6B4-\U0001F6B6 \U0001F6C0];
+$EmojiMods = [\U0001F3FB-\U0001F3FF];
+
# Dictionary character set, for triggering language-based break engines. Currently
# limited to LineBreak=Complex_Context. Note that this set only works in Unicode
# 5.0 or later as the definition of Complex_Context was corrected to include all
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
+# Special forward rule for people & body part emoji:
+# don't break $ZWJ from subsequent $EmojiForSeqs
+$EmojiForSeqs $EmojiVar? $EmojiMods? $ZWJ $EmojiForSeqs;
+
# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
# $CM not covered by the above needs to behave like $AL
# See definition of $CAN_CM.
# LB 30a Do not break between regional indicators.
$RIcm $RIcm;
+# Special forward rule for people & body part emoji:
+# don't break between relevant emoji and $EmojiMods
+$EmojiForMods $EmojiVar? $EmojiMods;
+
#
# Reverse Rules.
#
# Requires an engine enhancement.
# / $SP* $ZW
+# Special reverse rule for people & body part emoji:
+# don't break $ZWJ from subsequent $EmojiForSeqs
+$EmojiForSeqs $ZWJ $EmojiMods? $EmojiVar? $EmojiForSeqs;
+
# LB 9,10 Combining marks.
# X $CM needs to behave like X, where X is not $SP or controls.
# $CM not covered by the above needs to behave like $AL
# LB 30a
$CM* $RI $CM* $RI;
+# Special reverse rule for people & body part emoji:
+# don't break between relevant emoji and $EmojiMods
+$EmojiMods $EmojiVar? $EmojiForMods;
+
## -------------------------------------------------
!!safe_reverse;
-# Copyright (c) 2002-2013 International Business Machines Corporation and
+# Copyright (c) 2002-2015 International Business Machines Corporation and
# others. All Rights Reserved.
#
# file: line_ja.txt
$XX = [:LineBreak = Unknown:];
$ZW = [:LineBreak = ZWSpace:];
+# Special character classes for people & body part emoji:
+# Subsets of $CM:
+$ZWJ = \u200D;
+$EmojiVar = \uFE0F;
+# The following are subsets of $ID
+$EmojiForSeqs = [\u2764 \U0001F466-\U0001F469 \U0001F48B];
+$EmojiForMods = [\u261D \u270A-\u270C \U0001F385 \U0001F3C3-\U0001F3C4 \U0001F3C7 \U0001F3CA \U0001F442-\U0001F443 \U0001F446-\U0001F450 \U0001F466-\U0001F469 \U0001F46E-\U0001F478 \U0001F47C \U0001F481-\U0001F483 \U0001F485-\U0001F487 \U0001F4AA \U0001F596 \U0001F645-\U0001F647 \U0001F64B-\U0001F64F \U0001F6A3 \U0001F6B4-\U0001F6B6 \U0001F6C0];
+$EmojiMods = [\U0001F3FB-\U0001F3FF];
+
# Dictionary character set, for triggering language-based break engines. Currently
# limited to LineBreak=Complex_Context. Note that this set only works in Unicode
# 5.0 or later as the definition of Complex_Context was corrected to include all
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
+# Special forward rule for people & body part emoji:
+# don't break $ZWJ from subsequent $EmojiForSeqs
+$EmojiForSeqs $EmojiVar? $EmojiMods? $ZWJ $EmojiForSeqs;
+
# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
# $CM not covered by the above needs to behave like $AL
# See definition of $CAN_CM.
# LB 30a Do not break between regional indicators.
$RIcm $RIcm;
+# Special forward rule for people & body part emoji:
+# don't break between relevant emoji and $EmojiMods
+$EmojiForMods $EmojiVar? $EmojiMods;
+
#
# Reverse Rules.
#
# Requires an engine enhancement.
# / $SP* $ZW
+# Special reverse rule for people & body part emoji:
+# don't break $ZWJ from subsequent $EmojiForSeqs
+$EmojiForSeqs $ZWJ $EmojiMods? $EmojiVar? $EmojiForSeqs;
+
# LB 9,10 Combining marks.
# X $CM needs to behave like X, where X is not $SP or controls.
# $CM not covered by the above needs to behave like $AL
# LB 30a
$CM* $RI $CM* $RI;
+# Special reverse rule for people & body part emoji:
+# don't break between relevant emoji and $EmojiMods
+$EmojiMods $EmojiVar? $EmojiForMods;
+
## -------------------------------------------------
!!safe_reverse;
--- /dev/null
+# Copyright (c) 2002-2015 International Business Machines Corporation and
+# others. All Rights Reserved.
+#
+# file: line_loose.txt
+#
+# Line Breaking Rules
+# Implement default line breaking as defined by
+# Unicode Standard Annex #14 Revision 29 for Unicode 6.2
+# http://www.unicode.org/reports/tr14/
+# tailored as noted in the second paragraph below.
+#
+# TODO: Rule LB 8 remains as it was in Unicode 5.2
+# This is only because of a limitation of ICU break engine implementation,
+# not because the older behavior is desirable.
+#
+# This tailors the line break behavior to correspond to CSS
+# line-break=loose (BCP47 -u-lb-loose) as defined for languages other than
+# Chinese & Japanese.
+# It sets characters of class CJ to behave like ID.
+# In addition, it allows breaks:
+# * before iteration marks 3005, 303B, 309D, 309E, 30FD, 30FE (all NS)
+# * between characters of LineBreak class IN
+
+#
+# Character Classes defined by TR 14.
+#
+
+!!chain;
+!!LBCMNoChain;
+
+
+!!lookAheadHardBreak;
+#
+# !!lookAheadHardBreak Described here because it is (as yet) undocumented elsewhere
+# and only used for the line break rules.
+#
+# It is used in the implementation of rule LB 10
+# which says to treat any combining mark that is not attached to a base
+# character as if it were of class AL (alphabetic).
+#
+# The problem occurs in the reverse rules.
+#
+# Consider a sequence like the following, with the correct breaks as shown:
+# LF ID CM AL AL
+# ^ ^ ^
+# Then consider the sequence without the initial ID (ideographic)
+# LF CM AL AL
+# ^ ^
+# Our CM, which in the first example was attached to the ideograph,
+# is now unattached, becomes an alpha, and joins in with the other
+# alphas.
+#
+# When iterating forwards, these sequences do not present any problems.
+# When iterating backwards, we need to look ahead when encountering
+# a CM to see whether it attaches to something further on or not.
+# (Look-ahead in a reverse rule is looking towards the start)
+#
+# If the CM is unattached, we need to force a break.
+#
+# !!lookAheadHardBreak forces the run time state machine to
+# stop immediately when a look ahead rule ( '/' operator) matches,
+# and set the match position to that of the look-ahead operator,
+# no matter what other rules may be in play at the time.
+#
+# See rule LB 19 for an example.
+#
+
+$AI = [:LineBreak = Ambiguous:];
+$AL = [:LineBreak = Alphabetic:];
+$BA = [:LineBreak = Break_After:];
+$BB = [:LineBreak = Break_Before:];
+$BK = [:LineBreak = Mandatory_Break:];
+$B2 = [:LineBreak = Break_Both:];
+$CB = [:LineBreak = Contingent_Break:];
+$CJ = [:LineBreak = Conditional_Japanese_Starter:];
+$CL = [:LineBreak = Close_Punctuation:];
+$CM = [:LineBreak = Combining_Mark:];
+$CP = [:LineBreak = Close_Parenthesis:];
+$CR = [:LineBreak = Carriage_Return:];
+$EX = [:LineBreak = Exclamation:];
+$GL = [:LineBreak = Glue:];
+$HL = [:LineBreak = Hebrew_Letter:];
+$HY = [:LineBreak = Hyphen:];
+$H2 = [:LineBreak = H2:];
+$H3 = [:LineBreak = H3:];
+$ID = [[:LineBreak = Ideographic:] $CJ];
+$IN = [:LineBreak = Inseperable:];
+$IS = [:LineBreak = Infix_Numeric:];
+$JL = [:LineBreak = JL:];
+$JV = [:LineBreak = JV:];
+$JT = [:LineBreak = JT:];
+$LF = [:LineBreak = Line_Feed:];
+$NL = [:LineBreak = Next_Line:];
+$NSX = [\u3005 \u303B \u309D \u309E \u30FD \u30FE];
+$NS = [[:LineBreak = Nonstarter:] - $NSX];
+$NU = [:LineBreak = Numeric:];
+$OP = [:LineBreak = Open_Punctuation:];
+$PO = [:LineBreak = Postfix_Numeric:];
+$PR = [:LineBreak = Prefix_Numeric:];
+$QU = [:LineBreak = Quotation:];
+$RI = [:LineBreak = Regional_Indicator:];
+$SA = [:LineBreak = Complex_Context:];
+$SG = [:LineBreak = Surrogate:];
+$SP = [:LineBreak = Space:];
+$SY = [:LineBreak = Break_Symbols:];
+$WJ = [:LineBreak = Word_Joiner:];
+$XX = [:LineBreak = Unknown:];
+$ZW = [:LineBreak = ZWSpace:];
+
+# Special character classes for people & body part emoji:
+# Subsets of $CM:
+$ZWJ = \u200D;
+$EmojiVar = \uFE0F;
+# The following are subsets of $ID
+$EmojiForSeqs = [\u2764 \U0001F466-\U0001F469 \U0001F48B];
+$EmojiForMods = [\u261D \u270A-\u270C \U0001F385 \U0001F3C3-\U0001F3C4 \U0001F3C7 \U0001F3CA \U0001F442-\U0001F443 \U0001F446-\U0001F450 \U0001F466-\U0001F469 \U0001F46E-\U0001F478 \U0001F47C \U0001F481-\U0001F483 \U0001F485-\U0001F487 \U0001F4AA \U0001F596 \U0001F645-\U0001F647 \U0001F64B-\U0001F64F \U0001F6A3 \U0001F6B4-\U0001F6B6 \U0001F6C0];
+$EmojiMods = [\U0001F3FB-\U0001F3FF];
+
+# Dictionary character set, for triggering language-based break engines. Currently
+# limited to LineBreak=Complex_Context. Note that this set only works in Unicode
+# 5.0 or later as the definition of Complex_Context was corrected to include all
+# characters requiring dictionary break.
+
+$dictionary = [:LineBreak = Complex_Context:];
+
+#
+# Rule LB1. By default, treat AI (characters with ambiguous east Asian width),
+# SA (South East Asian: Thai, Lao, Khmer)
+# SG (Unpaired Surrogates)
+# XX (Unknown, unassigned)
+# as $AL (Alphabetic)
+#
+$ALPlus = [$AL $AI $SA $SG $XX];
+
+#
+# Combining Marks. X $CM* behaves as if it were X. Rule LB6.
+#
+$ALcm = $ALPlus $CM*;
+$BAcm = $BA $CM*;
+$BBcm = $BB $CM*;
+$B2cm = $B2 $CM*;
+$CLcm = $CL $CM*;
+$CPcm = $CP $CM*;
+$EXcm = $EX $CM*;
+$GLcm = $GL $CM*;
+$HLcm = $HL $CM*;
+$HYcm = $HY $CM*;
+$H2cm = $H2 $CM*;
+$H3cm = $H3 $CM*;
+$IDcm = $ID $CM*;
+$INcm = $IN $CM*;
+$IScm = $IS $CM*;
+$JLcm = $JL $CM*;
+$JVcm = $JV $CM*;
+$JTcm = $JT $CM*;
+$NScm = $NS $CM*;
+$NSXcm = $NSX $CM*;
+$NUcm = $NU $CM*;
+$OPcm = $OP $CM*;
+$POcm = $PO $CM*;
+$PRcm = $PR $CM*;
+$QUcm = $QU $CM*;
+$RIcm = $RI $CM*;
+$SYcm = $SY $CM*;
+$WJcm = $WJ $CM*;
+
+## -------------------------------------------------
+
+!!forward;
+
+#
+# Each class of character can stand by itself as an unbroken token, with trailing combining stuff
+#
+$ALPlus $CM+;
+$BA $CM+;
+$BB $CM+;
+$B2 $CM+;
+$CL $CM+;
+$CP $CM+;
+$EX $CM+;
+$GL $CM+;
+$HL $CM+;
+$HY $CM+;
+$H2 $CM+;
+$H3 $CM+;
+$ID $CM+;
+$IN $CM+;
+$IS $CM+;
+$JL $CM+;
+$JV $CM+;
+$JT $CM+;
+$NS $CM+;
+$NSX $CM+;
+$NU $CM+;
+$OP $CM+;
+$PO $CM+;
+$PR $CM+;
+$QU $CM+;
+$RI $CM+;
+$SY $CM+;
+$WJ $CM+;
+
+#
+# CAN_CM is the set of characters that may combine with CM combining chars.
+# Note that Linebreak UAX 14's concept of a combining char and the rules
+# for what they can combine with are _very_ different from the rest of Unicode.
+#
+# Note that $CM itself is left out of this set. If CM is needed as a base
+# it must be listed separately in the rule.
+#
+$CAN_CM = [^$SP $BK $CR $LF $NL $ZW $CM]; # Bases that can take CMs
+$CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM]; # Bases that can't take CMs
+
+#
+# AL_FOLLOW set of chars that can unconditionally follow an AL
+# Needed in rules where stand-alone $CM s are treated as AL.
+# Chaining is disabled with CM because it causes other failures,
+# so for this one case we need to manually list out longer sequences.
+#
+$AL_FOLLOW_NOCM = [$BK $CR $LF $NL $ZW $SP];
+$AL_FOLLOW_CM = [$CL $CP $EX $HL $IS $SY $WJ $GL $OP $QU $BA $HY $NS $NSX $IN $NU $ALPlus];
+$AL_FOLLOW = [$AL_FOLLOW_NOCM $AL_FOLLOW_CM];
+
+
+#
+# Rule LB 4, 5 Mandatory (Hard) breaks.
+#
+$LB4Breaks = [$BK $CR $LF $NL];
+$LB4NonBreaks = [^$BK $CR $LF $NL];
+$CR $LF {100};
+
+#
+# LB 6 Do not break before hard line breaks.
+#
+$LB4NonBreaks? $LB4Breaks {100}; # LB 5 do not break before hard breaks.
+$CAN_CM $CM* $LB4Breaks {100};
+$CM+ $LB4Breaks {100};
+
+# LB 7 x SP
+# x ZW
+$LB4NonBreaks [$SP $ZW];
+$CAN_CM $CM* [$SP $ZW];
+$CM+ [$SP $ZW];
+
+#
+# LB 8 Break after zero width space
+# TODO: ZW SP* <break>
+# An engine change is required to write the reverse rule for this.
+# For now, leave the Unicode 5.2 rule, ZW <break>
+#
+$LB8Breaks = [$LB4Breaks $ZW];
+$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
+
+
+# Special forward rule for people & body part emoji:
+# don't break $ZWJ from subsequent $EmojiForSeqs
+$EmojiForSeqs $EmojiVar? $EmojiMods? $ZWJ $EmojiForSeqs;
+
+# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
+# $CM not covered by the above needs to behave like $AL
+# See definition of $CAN_CM.
+
+$CAN_CM $CM+; # Stick together any combining sequences that don't match other rules.
+$CM+;
+
+#
+# LB 11 Do not break before or after WORD JOINER & related characters.
+#
+$CAN_CM $CM* $WJcm;
+$LB8NonBreaks $WJcm;
+$CM+ $WJcm;
+
+$WJcm $CANT_CM;
+$WJcm $CAN_CM $CM*;
+
+#
+# LB 12 Do not break after NBSP and related characters.
+# GL x
+#
+$GLcm $CAN_CM $CM*;
+$GLcm $CANT_CM;
+
+#
+# LB 12a Do not break before NBSP and related characters ...
+# [^SP BA HY] x GL
+#
+[[$LB8NonBreaks] - [$SP $BA $HY]] $CM* $GLcm;
+$CM+ $GLcm; # by rule 10, stand-alone CM behaves as AL; $GLcm must be the variable, not literal text
+
+
+
+#
+# LB 13 Don't break before ']' or '!' or ';' or '/', even after spaces.
+#
+$LB8NonBreaks $CL;
+$CAN_CM $CM* $CL;
+$CM+ $CL; # by rule 10, stand-alone CM behaves as AL
+
+$LB8NonBreaks $CP;
+$CAN_CM $CM* $CP;
+$CM+ $CP; # by rule 10, stand-alone CM behaves as AL
+
+$LB8NonBreaks $EX;
+$CAN_CM $CM* $EX;
+$CM+ $EX; # by rule 10, stand-alone CM behaves as AL
+
+$LB8NonBreaks $IS;
+$CAN_CM $CM* $IS;
+$CM+ $IS; # by rule 10, stand-alone CM behaves as AL
+
+$LB8NonBreaks $SY;
+$CAN_CM $CM* $SY;
+$CM+ $SY; # by rule 10, stand-alone CM behaves as AL
+
+
+#
+# LB 14 Do not break after OP, even after spaces
+#
+$OPcm $SP* $CAN_CM $CM*;
+$OPcm $SP* $CANT_CM;
+
+$OPcm $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL
+
+# LB 15
+$QUcm $SP* $OPcm;
+
+# LB 16
+# Do not break between closing punctuation and $NS, even with intervening spaces
+# But DO allow a break between closing punctuation and $NSX, don't include it here
+($CLcm | $CPcm) $SP* $NScm;
+
+# LB 17
+$B2cm $SP* $B2cm;
+
+#
+# LB 18 Break after spaces.
+#
+$LB18NonBreaks = [$LB8NonBreaks - [$SP]];
+$LB18Breaks = [$LB8Breaks $SP];
+
+
+# LB 19
+# x QU
+$LB18NonBreaks $CM* $QUcm;
+$CM+ $QUcm;
+
+# QU x
+$QUcm .?;
+$QUcm $LB18NonBreaks $CM*; # Don't let a combining mark go onto $CR, $BK, etc.
+ # TODO: I don't think this rule is needed.
+
+
+# LB 20
+# <break> $CB
+# $CB <break>
+
+$LB20NonBreaks = [$LB18NonBreaks - $CB];
+
+# LB 21 x (BA | HY | NS)
+# BB x
+#
+# DO allow breaks here before NSXcm, so don't include it
+$LB20NonBreaks $CM* ($BAcm | $HYcm | $NScm);
+
+$BBcm [^$CB]; # $BB x
+$BBcm $LB20NonBreaks $CM*;
+
+# LB 21a Don't break after Hebrew + Hyphen
+# HL (HY | BA) x
+#
+$HLcm ($HYcm | $BAcm) [^$CB]?;
+
+# LB 21b (forward) Don't break between SY and HL
+# (break between HL and SY already disallowed by LB 13 above)
+$SYcm $HLcm;
+
+# LB 22
+($ALcm | $HLcm) $INcm;
+$CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL
+$IDcm $INcm;
+# $INcm $INcm; # delete this rule for CSS loose
+$NUcm $INcm;
+
+
+# LB 23
+$IDcm $POcm;
+$ALcm $NUcm; # includes $LB19
+$HLcm $NUcm;
+$CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL
+$NUcm $ALcm;
+$NUcm $HLcm;
+
+#
+# LB 24
+#
+$PRcm $IDcm;
+$PRcm ($ALcm | $HLcm);
+$POcm ($ALcm | $HLcm);
+
+#
+# LB 25 Numbers.
+#
+($PRcm | $POcm)? ($OPcm | $HYcm)? $NUcm ($NUcm | $SYcm | $IScm)* ($CLcm | $CPcm)? ($PRcm | $POcm)?;
+
+# LB 26 Do not break a Korean syllable
+#
+$JLcm ($JLcm | $JVcm | $H2cm | $H3cm);
+($JVcm | $H2cm) ($JVcm | $JTcm);
+($JTcm | $H3cm) $JTcm;
+
+# LB 27 Treat korean Syllable Block the same as ID (don't break it)
+($JLcm | $JVcm | $JTcm | $H2cm | $H3cm) $INcm;
+($JLcm | $JVcm | $JTcm | $H2cm | $H3cm) $POcm;
+$PRcm ($JLcm | $JVcm | $JTcm | $H2cm | $H3cm);
+
+
+# LB 28 Do not break between alphabetics
+#
+($ALcm | $HLcm) ($ALcm | $HLcm);
+$CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treated as AL
+
+# LB 29
+$IScm ($ALcm | $HLcm);
+
+# LB 30
+($ALcm | $HLcm | $NUcm) $OPcm;
+$CM+ $OPcm; # The $CM+ is from rule 10, an unattached CM is treated as AL.
+$CPcm ($ALcm | $HLcm | $NUcm);
+
+# LB 30a Do not break between regional indicators.
+$RIcm $RIcm;
+
+# Special forward rule for people & body part emoji:
+# don't break between relevant emoji and $EmojiMods
+$EmojiForMods $EmojiVar? $EmojiMods;
+
+#
+# Reverse Rules.
+#
+## -------------------------------------------------
+
+!!reverse;
+
+$CM+ $ALPlus;
+$CM+ $BA;
+$CM+ $BB;
+$CM+ $B2;
+$CM+ $CL;
+$CM+ $CP;
+$CM+ $EX;
+$CM+ $GL;
+$CM+ $HL;
+$CM+ $HY;
+$CM+ $H2;
+$CM+ $H3;
+$CM+ $ID;
+$CM+ $IN;
+$CM+ $IS;
+$CM+ $JL;
+$CM+ $JV;
+$CM+ $JT;
+$CM+ $NS;
+$CM+ $NSX;
+$CM+ $NU;
+$CM+ $OP;
+$CM+ $PO;
+$CM+ $PR;
+$CM+ $QU;
+$CM+ $RI;
+$CM+ $SY;
+$CM+ $WJ;
+$CM+;
+
+
+#
+# Sequences of the form (shown forwards)
+# [CANT_CM] <break> [CM] [whatever]
+# The CM needs to behave as an AL
+#
+$AL_FOLLOW $CM+ / (
+ [$BK $CR $LF $NL $ZW {eof}] |
+ $SP+ $CM+ $SP |
+ $SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}])); # if LB 14 will match, need to suppress this break.
+ # LB14 says OP SP* x .
+ # becomes OP SP* x AL
+ # becomes OP SP* x CM+ AL_FOLLOW
+ #
+ # Further note: the $AL in [$AL {eof}] is only to work around
+ # a rule compiler bug which complains about
+ # empty sets otherwise.
+
+#
+# Sequences of the form (shown forwards)
+# [CANT_CM] <break> [CM] <break> [PR]
+# The CM needs to behave as an AL
+# This rule is concerned about getting the second of the two <breaks> in place.
+#
+
+[$PR ] / $CM+ [$BK $CR $LF $NL $ZW $SP {eof}];
+
+
+
+# LB 4, 5, 6
+
+$LB4Breaks [$LB4NonBreaks-$CM];
+$LB4Breaks $CM+ $CAN_CM;
+$LF $CR;
+
+
+# LB 7 x SP
+# x ZW
+[$SP $ZW] [$LB4NonBreaks-$CM];
+[$SP $ZW] $CM+ $CAN_CM;
+
+# LB 8 ZW SP* <break>
+# TODO: to implement this, we need more than one look-ahead hard break in play at a time.
+# Requires an engine enhancement.
+# / $SP* $ZW
+
+# Special reverse rule for people & body part emoji:
+# don't break $ZWJ from subsequent $EmojiForSeqs
+$EmojiForSeqs $ZWJ $EmojiMods? $EmojiVar? $EmojiForSeqs;
+
+# LB 9,10 Combining marks.
+# X $CM needs to behave like X, where X is not $SP or controls.
+# $CM not covered by the above needs to behave like $AL
+# Stick together any combining sequences that don't match other rules.
+$CM+ $CAN_CM;
+
+
+# LB 11
+$CM* $WJ $CM* $CAN_CM;
+$CM* $WJ [$LB8NonBreaks-$CM];
+
+ $CANT_CM $CM* $WJ;
+$CM* $CAN_CM $CM* $WJ;
+
+# LB 12a
+# [^SP BA HY] x GL
+#
+$CM* $GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HY]];
+
+# LB 12
+# GL x
+#
+$CANT_CM $CM* $GL;
+$CM* $CAN_CM $CM* $GL;
+
+
+# LB 13
+$CL $CM+ $CAN_CM;
+$CP $CM+ $CAN_CM;
+$EX $CM+ $CAN_CM;
+$IS $CM+ $CAN_CM;
+$SY $CM+ $CAN_CM;
+
+$CL [$LB8NonBreaks-$CM];
+$CP [$LB8NonBreaks-$CM];
+$EX [$LB8NonBreaks-$CM];
+$IS [$LB8NonBreaks-$CM];
+$SY [$LB8NonBreaks-$CM];
+
+# Rule 13 & 14 taken together for an edge case.
+# Match this, shown forward
+# OP SP+ ($CM+ behaving as $AL) (CL | CP | EX | IS | SY)
+# This really wants to chain at the $CM+ (which is acting as an $AL)
+# except for $CM chaining being disabled.
+[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP;
+
+# LB 14 OP SP* x
+#
+$CM* $CAN_CM $SP* $CM* $OP;
+ $CANT_CM $SP* $CM* $OP;
+$AL_FOLLOW? $CM+ $SP $SP* $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP* $CM* $OP
+
+ $AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;
+$CM* $AL_FOLLOW_CM $CM+ $SP+ $CM* $OP;
+$SY $CM $SP+ $OP; # TODO: Experiment. Remove.
+
+
+
+# LB 15
+$CM* $OP $SP* $CM* $QU;
+
+# LB 16
+# Don't include $NSX here
+$CM* $NS $SP* $CM* ($CL | $CP);
+
+# LB 17
+$CM* $B2 $SP* $CM* $B2;
+
+# LB 18 break after spaces
+# Nothing explicit needed here.
+
+
+#
+# LB 19
+#
+$CM* $QU $CM* $CAN_CM; # . x QU
+$CM* $QU $LB18NonBreaks;
+
+
+$CM* $CAN_CM $CM* $QU; # QU x .
+ $CANT_CM $CM* $QU;
+
+#
+# LB 20 Break before and after CB.
+# nothing needed here.
+#
+
+# LB 21
+# Don't include $NSX here
+$CM* ($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS)
+
+$CM* [$LB20NonBreaks-$CM] $CM* $BB; # BB x .
+[^$CB] $CM* $BB; #
+
+# LB21a
+[^$CB] $CM* ($HY | $BA) $CM* $HL;
+
+# LB21b (reverse)
+$CM* $HL $CM* $SY;
+
+# LB 22
+$CM* $IN $CM* ($ALPlus | $HL);
+$CM* $IN $CM* $ID;
+# $CM* $IN $CM* $IN; # delete this rule for CSS loose
+$CM* $IN $CM* $NU;
+
+# LB 23
+$CM* $PO $CM* $ID;
+$CM* $NU $CM* ($ALPlus | $HL);
+$CM* ($ALPlus | $HL) $CM* $NU;
+
+# LB 24
+$CM* $ID $CM* $PR;
+$CM* ($ALPlus | $HL) $CM* $PR;
+$CM* ($ALPlus | $HL) $CM* $PO;
+
+
+# LB 25
+($CM* ($PR | $PO))? ($CM* ($CL | $CP))? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP | $HY))? ($CM* ($PR | $PO))?;
+
+# LB 26
+$CM* ($H3 | $H2 | $JV | $JL) $CM* $JL;
+$CM* ($JT | $JV) $CM* ($H2 | $JV);
+$CM* $JT $CM* ($H3 | $JT);
+
+# LB 27
+$CM* $IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
+$CM* $PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
+$CM* ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
+
+# LB 28
+$CM* ($ALPlus | $HL) $CM* ($ALPlus | $HL);
+
+
+# LB 29
+$CM* ($ALPlus | $HL) $CM* $IS;
+
+# LB 30
+$CM* $OP $CM* ($ALPlus | $HL | $NU);
+$CM* ($ALPlus | $HL | $NU) $CM* $CP;
+
+# LB 30a
+$CM* $RI $CM* $RI;
+
+# Special reverse rule for people & body part emoji:
+# don't break between relevant emoji and $EmojiMods
+$EmojiMods $EmojiVar? $EmojiForMods;
+
+## -------------------------------------------------
+
+!!safe_reverse;
+
+# LB 9
+$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
+$CM+ $SP / .;
+
+# LB 14
+$SP+ $CM* $OP;
+
+# LB 15
+$SP+ $CM* $QU;
+
+# LB 16
+$SP+ $CM* ($CL | $CP);
+
+# LB 17
+$SP+ $CM* $B2;
+
+# LB 21
+$CM* ($HY | $BA) $CM* $HL;
+
+# LB 25
+($CM* ($IS | $SY))+ $CM* $NU;
+($CL | $CP) $CM* ($NU | $IS | $SY);
+
+# For dictionary-based break
+$dictionary $dictionary;
+
+## -------------------------------------------------
+
+!!safe_forward;
+
+# Skip forward over all character classes that are involved in
+# rules containing patterns with possibly more than one char
+# of context.
+#
+# It might be slightly more efficient to have specific rules
+# instead of one generic one, but only if we could
+# turn off rule chaining. We don't want to move more
+# than necessary.
+#
+[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $SP $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $dictionary];
+$dictionary $dictionary;
+
--- /dev/null
+# Copyright (c) 2002-2015 International Business Machines Corporation and
+# others. All Rights Reserved.
+#
+# file: line_loose_cj.txt
+#
+# Line Breaking Rules
+# Implement default line breaking as defined by
+# Unicode Standard Annex #14 Revision 29 for Unicode 6.2
+# http://www.unicode.org/reports/tr14/
+# tailored as noted in 2nd paragraph below.
+#
+# TODO: Rule LB 8 remains as it was in Unicode 5.2
+# This is only because of a limitation of ICU break engine implementation,
+# not because the older behavior is desirable.
+#
+# This tailors the line break behavior to correspond to CSS
+# line-break=loose (BCP47 -u-lb-loose) as defined for Chinese & Japanese.
+# It sets characters of class CJ to behave like ID.
+# In addition, it allows breaks:
+# * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS)
+# * before iteration marks 3005, 303B, 309D, 309E, 30FD, 30FE (all NS)
+# * between characters of LineBreak class IN such as 2026
+# * before some centered punct 203C, 2047, 2048, 2049, 30FB, FF1A, FF1B,
+# FF65 (all NS) and FF01, FF1F (both EX).
+# * before suffix characters with LineBreak class PO and EastAsianWidth A,F,W;
+# this includes: 00B0 2030 2032 2033 2035 2103 2109 FE6A FF05 FFE0
+# * after prefix characters with LineBreak class PR and EastAsianWidth A,F,W;
+# this includes: 00A4 00B1 20AC 2116 FE69 FF04 FFE1 FFE5 FFE6
+
+
+#
+# Character Classes defined by TR 14.
+#
+
+!!chain;
+!!LBCMNoChain;
+
+
+!!lookAheadHardBreak;
+#
+# !!lookAheadHardBreak Described here because it is (as yet) undocumented elsewhere
+# and only used for the line break rules.
+#
+# It is used in the implementation of rule LB 10
+# which says to treat any combining mark that is not attached to a base
+# character as if it were of class AL (alphabetic).
+#
+# The problem occurs in the reverse rules.
+#
+# Consider a sequence like, with correct breaks as shown
+# LF ID CM AL AL
+# ^ ^ ^
+# Then consider the sequence without the initial ID (ideographic)
+# LF CM AL AL
+# ^ ^
+# Our CM, which in the first example was attached to the ideograph,
+# is now unattached, becomes an alpha, and joins in with the other
+# alphas.
+#
+# When iterating forwards, these sequences do not present any problems
+# When iterating backwards, we need to look ahead when encountering
+# a CM to see whether it attaches to something further on or not.
+# (Look-ahead in a reverse rule is looking towards the start)
+#
+# If the CM is unattached, we need to force a break.
+#
+# !!lookAheadHardBreak forces the run time state machine to
+# stop immediately when a look ahead rule ( '/' operator) matches,
+# and set the match position to that of the look-ahead operator,
+# no matter what other rules may be in play at the time.
+#
+# See rule LB 19 for an example.
+#
+
+$AI = [:LineBreak = Ambiguous:];
+$AL = [:LineBreak = Alphabetic:];
+$BAX = [\u2010 \u2013];
+$BA = [[:LineBreak = Break_After:] - $BAX];
+$BB = [:LineBreak = Break_Before:];
+$BK = [:LineBreak = Mandatory_Break:];
+$B2 = [:LineBreak = Break_Both:];
+$CB = [:LineBreak = Contingent_Break:];
+$CJ = [:LineBreak = Conditional_Japanese_Starter:];
+$CL = [:LineBreak = Close_Punctuation:];
+$CM = [:LineBreak = Combining_Mark:];
+$CP = [:LineBreak = Close_Parenthesis:];
+$CR = [:LineBreak = Carriage_Return:];
+$EXX = [\uFF01 \uFF1F];
+$EX = [[:LineBreak = Exclamation:] - $EXX];
+$GL = [:LineBreak = Glue:];
+$HL = [:LineBreak = Hebrew_Letter:];
+$HY = [:LineBreak = Hyphen:];
+$H2 = [:LineBreak = H2:];
+$H3 = [:LineBreak = H3:];
+$ID = [[:LineBreak = Ideographic:] $CJ];
+$IN = [:LineBreak = Inseperable:];
+$IS = [:LineBreak = Infix_Numeric:];
+$JL = [:LineBreak = JL:];
+$JV = [:LineBreak = JV:];
+$JT = [:LineBreak = JT:];
+$LF = [:LineBreak = Line_Feed:];
+$NL = [:LineBreak = Next_Line:];
+$NSX = [\u301C \u30A0 \u3005 \u303B \u309D \u309E \u30FD \u30FE \u203C \u2047 \u2048 \u2049 \u30FB \uFF1A \uFF1B \uFF65];
+$NS = [[:LineBreak = Nonstarter:] - $NSX];
+$NU = [:LineBreak = Numeric:];
+$OP = [:LineBreak = Open_Punctuation:];
+$POX = [\u00B0 \u2030 \u2032 \u2033 \u2035 \u2103 \u2109 \uFE6A \uFF05 \uFFE0];
+$PO = [[:LineBreak = Postfix_Numeric:] - $POX];
+$PRX = [\u00A4 \u00B1 \u20AC \u2116 \uFE69 \uFF04 \uFFE1 \uFFE5 \uFFE6];
+$PR = [[:LineBreak = Prefix_Numeric:] - $PRX];
+$QU = [:LineBreak = Quotation:];
+$RI = [:LineBreak = Regional_Indicator:];
+$SA = [:LineBreak = Complex_Context:];
+$SG = [:LineBreak = Surrogate:];
+$SP = [:LineBreak = Space:];
+$SY = [:LineBreak = Break_Symbols:];
+$WJ = [:LineBreak = Word_Joiner:];
+$XX = [:LineBreak = Unknown:];
+$ZW = [:LineBreak = ZWSpace:];
+
+# Special character classes for people & body part emoji:
+# Subsets of $CM:
+$ZWJ = \u200D;
+$EmojiVar = \uFE0F;
+# The following are subsets of $ID
+$EmojiForSeqs = [\u2764 \U0001F466-\U0001F469 \U0001F48B];
+$EmojiForMods = [\u261D \u270A-\u270C \U0001F385 \U0001F3C3-\U0001F3C4 \U0001F3C7 \U0001F3CA \U0001F442-\U0001F443 \U0001F446-\U0001F450 \U0001F466-\U0001F469 \U0001F46E-\U0001F478 \U0001F47C \U0001F481-\U0001F483 \U0001F485-\U0001F487 \U0001F4AA \U0001F596 \U0001F645-\U0001F647 \U0001F64B-\U0001F64F \U0001F6A3 \U0001F6B4-\U0001F6B6 \U0001F6C0];
+$EmojiMods = [\U0001F3FB-\U0001F3FF];
+
+# Dictionary character set, for triggering language-based break engines. Currently
+# limited to LineBreak=Complex_Context. Note that this set only works in Unicode
+# 5.0 or later as the definition of Complex_Context was corrected to include all
+# characters requiring dictionary break.
+
+$dictionary = [:LineBreak = Complex_Context:];
+
+#
+# Rule LB1. By default, treat AI (characters with ambiguous east Asian width),
+# SA (South East Asian: Thai, Lao, Khmer)
+# SG (Unpaired Surrogates)
+# XX (Unknown, unassigned)
+# as $AL (Alphabetic)
+#
+$ALPlus = [$AL $AI $SA $SG $XX];
+
+#
+# Combining Marks. X $CM* behaves as if it were X. Rule LB6.
+#
+$ALcm = $ALPlus $CM*;
+$BAcm = $BA $CM*;
+$BAXcm = $BAX $CM*;
+$BBcm = $BB $CM*;
+$B2cm = $B2 $CM*;
+$CLcm = $CL $CM*;
+$CPcm = $CP $CM*;
+$EXcm = $EX $CM*;
+$EXXcm = $EXX $CM*;
+$GLcm = $GL $CM*;
+$HLcm = $HL $CM*;
+$HYcm = $HY $CM*;
+$H2cm = $H2 $CM*;
+$H3cm = $H3 $CM*;
+$IDcm = $ID $CM*;
+$INcm = $IN $CM*;
+$IScm = $IS $CM*;
+$JLcm = $JL $CM*;
+$JVcm = $JV $CM*;
+$JTcm = $JT $CM*;
+$NScm = $NS $CM*;
+$NSXcm = $NSX $CM*;
+$NUcm = $NU $CM*;
+$OPcm = $OP $CM*;
+$POcm = $PO $CM*;
+$POXcm = $POX $CM*;
+$PRcm = $PR $CM*;
+$PRXcm = $PRX $CM*;
+$QUcm = $QU $CM*;
+$RIcm = $RI $CM*;
+$SYcm = $SY $CM*;
+$WJcm = $WJ $CM*;
+
+## -------------------------------------------------
+
+!!forward;
+
+#
+# Each class of character can stand by itself as an unbroken token, with trailing combining stuff
+#
+$ALPlus $CM+;
+$BA $CM+;
+$BAX $CM+;
+$BB $CM+;
+$B2 $CM+;
+$CL $CM+;
+$CP $CM+;
+$EX $CM+;
+$EXX $CM+;
+$GL $CM+;
+$HL $CM+;
+$HY $CM+;
+$H2 $CM+;
+$H3 $CM+;
+$ID $CM+;
+$IN $CM+;
+$IS $CM+;
+$JL $CM+;
+$JV $CM+;
+$JT $CM+;
+$NS $CM+;
+$NSX $CM+;
+$NU $CM+;
+$OP $CM+;
+$PO $CM+;
+$POX $CM+;
+$PR $CM+;
+$PRX $CM+;
+$QU $CM+;
+$RI $CM+;
+$SY $CM+;
+$WJ $CM+;
+
+#
+# CAN_CM is the set of characters that may combine with CM combining chars.
+# Note that Linebreak UAX 14's concept of a combining char and the rules
+# for what they can combine with are _very_ different from the rest of Unicode.
+#
+# Note that $CM itself is left out of this set. If CM is needed as a base
+# it must be listed separately in the rule.
+#
+$CAN_CM = [^$SP $BK $CR $LF $NL $ZW $CM]; # Bases that can take CMs
+$CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM]; # Bases that can't take CMs
+
+#
+# AL_FOLLOW set of chars that can unconditionally follow an AL
+# Needed in rules where stand-alone $CM s are treated as AL.
+# Chaining is disabled with CM because it causes other failures,
+# so for this one case we need to manually list out longer sequences.
+#
+$AL_FOLLOW_NOCM = [$BK $CR $LF $NL $ZW $SP];
+$AL_FOLLOW_CM = [$CL $CP $EX $EXX $HL $IS $SY $WJ $GL $OP $QU $BA $BAX $HY $NS $NSX $IN $NU $ALPlus];
+$AL_FOLLOW = [$AL_FOLLOW_NOCM $AL_FOLLOW_CM];
+
+
+#
+# Rule LB 4, 5 Mandatory (Hard) breaks.
+#
+$LB4Breaks = [$BK $CR $LF $NL];
+$LB4NonBreaks = [^$BK $CR $LF $NL];
+$CR $LF {100};
+
+#
+# LB 6 Do not break before hard line breaks.
+#
+$LB4NonBreaks? $LB4Breaks {100}; # LB 5 do not break before hard breaks.
+$CAN_CM $CM* $LB4Breaks {100};
+$CM+ $LB4Breaks {100};
+
+# LB 7 x SP
+# x ZW
+$LB4NonBreaks [$SP $ZW];
+$CAN_CM $CM* [$SP $ZW];
+$CM+ [$SP $ZW];
+
+#
+# LB 8 Break after zero width space
+# TODO: ZW SP* <break>
+# An engine change is required to write the reverse rule for this.
+# For now, leave the Unicode 5.2 rule, ZW <break>
+#
+$LB8Breaks = [$LB4Breaks $ZW];
+$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
+
+
+# Special forward rule for people & body part emoji:
+# don't break $ZWJ from subsequent $EmojiForSeqs
+$EmojiForSeqs $EmojiVar? $EmojiMods? $ZWJ $EmojiForSeqs;
+
+# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
+# $CM not covered by the above needs to behave like $AL
+# See definition of $CAN_CM.
+
+$CAN_CM $CM+; # Stick together any combining sequences that don't match other rules.
+$CM+;
+
+#
+# LB 11 Do not break before or after WORD JOINER & related characters.
+#
+$CAN_CM $CM* $WJcm;
+$LB8NonBreaks $WJcm;
+$CM+ $WJcm;
+
+$WJcm $CANT_CM;
+$WJcm $CAN_CM $CM*;
+
+#
+# LB 12 Do not break after NBSP and related characters.
+# GL x
+#
+$GLcm $CAN_CM $CM*;
+$GLcm $CANT_CM;
+
+#
+# LB 12a Do not break before NBSP and related characters ...
+# [^SP BA HY] x GL
+#
+[[$LB8NonBreaks] - [$SP $BA $BAX $HY]] $CM* $GLcm;
+$CM+ $GLcm; # by rule 10, stand-alone CM behaves as AL; $GLcm must be the variable, not literal text
+
+
+
+#
+# LB 13 Don't break before ']' or '!' or ';' or '/', even after spaces.
+#
+# Do not include $EXX here
+$LB8NonBreaks $CL;
+$CAN_CM $CM* $CL;
+$CM+ $CL; # by rule 10, stand-alone CM behaves as AL
+
+$LB8NonBreaks $CP;
+$CAN_CM $CM* $CP;
+$CM+ $CP; # by rule 10, stand-alone CM behaves as AL
+
+$LB8NonBreaks $EX;
+$CAN_CM $CM* $EX;
+$CM+ $EX; # by rule 10, stand-alone CM behaves as AL
+
+$LB8NonBreaks $IS;
+$CAN_CM $CM* $IS;
+$CM+ $IS; # by rule 10, stand-alone CM behaves as AL
+
+$LB8NonBreaks $SY;
+$CAN_CM $CM* $SY;
+$CM+ $SY; # by rule 10, stand-alone CM behaves as AL
+
+
+#
+# LB 14 Do not break after OP, even after spaces
+#
+$OPcm $SP* $CAN_CM $CM*;
+$OPcm $SP* $CANT_CM;
+
+$OPcm $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL
+
+# LB 15
+$QUcm $SP* $OPcm;
+
+# LB 16
+# Do not break between closing punctuation and $NS, even with intervening spaces
+# But DO allow a break between closing punctuation and $NSX, don't include it here
+($CLcm | $CPcm) $SP* $NScm;
+
+# LB 17
+$B2cm $SP* $B2cm;
+
+#
+# LB 18 Break after spaces.
+#
+$LB18NonBreaks = [$LB8NonBreaks - [$SP]];
+$LB18Breaks = [$LB8Breaks $SP];
+
+
+# LB 19
+# x QU
+$LB18NonBreaks $CM* $QUcm;
+$CM+ $QUcm;
+
+# QU x
+$QUcm .?;
+$QUcm $LB18NonBreaks $CM*; # Don't let a combining mark go onto $CR, $BK, etc.
+ # TODO: I don't think this rule is needed.
+
+
+# LB 20
+# <break> $CB
+# $CB <break>
+
+$LB20NonBreaks = [$LB18NonBreaks - $CB];
+
+# LB 21 x (BA | HY | NS)
+# BB x
+#
+# DO allow breaks here before $BAXcm and $NSXcm, so don't include them
+$LB20NonBreaks $CM* ($BAcm | $HYcm | $NScm);
+
+$BBcm [^$CB]; # $BB x
+$BBcm $LB20NonBreaks $CM*;
+
+# LB 21a Don't break after Hebrew + Hyphen
+# HL (HY | BA) x
+#
+$HLcm ($HYcm | $BAcm | $BAXcm) [^$CB]?;
+
+# LB 21b (forward) Don't break between SY and HL
+# (break between HL and SY already disallowed by LB 13 above)
+$SYcm $HLcm;
+
+# LB 22
+($ALcm | $HLcm) $INcm;
+$CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL
+$IDcm $INcm;
+# $INcm $INcm; # delete this rule for CSS loose
+$NUcm $INcm;
+
+
+# LB 23
+# Do not include $POX here
+$IDcm $POcm;
+$ALcm $NUcm; # includes $LB19
+$HLcm $NUcm;
+$CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL
+$NUcm $ALcm;
+$NUcm $HLcm;
+
+#
+# LB 24
+#
+# Do not include $PRX here
+$PRcm $IDcm;
+$PRcm ($ALcm | $HLcm);
+($POcm | $POXcm) ($ALcm | $HLcm);
+
+#
+# LB 25 Numbers.
+#
+# Here do not include $PRX at the beginning or $POX at the end
+($PRcm | $POcm | $POXcm)? ($OPcm | $HYcm)? $NUcm ($NUcm | $SYcm | $IScm)* ($CLcm | $CPcm)? ($PRcm | $PRXcm | $POcm)?;
+
+# LB 26 Do not break a Korean syllable
+#
+$JLcm ($JLcm | $JVcm | $H2cm | $H3cm);
+($JVcm | $H2cm) ($JVcm | $JTcm);
+($JTcm | $H3cm) $JTcm;
+
+# LB 27 Treat korean Syllable Block the same as ID (don't break it)
+# Do not include $POX or $PRX here
+($JLcm | $JVcm | $JTcm | $H2cm | $H3cm) $INcm;
+($JLcm | $JVcm | $JTcm | $H2cm | $H3cm) $POcm;
+$PRcm ($JLcm | $JVcm | $JTcm | $H2cm | $H3cm);
+
+
+# LB 28 Do not break between alphabetics
+#
+($ALcm | $HLcm) ($ALcm | $HLcm);
+$CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treated as AL
+
+# LB 29
+$IScm ($ALcm | $HLcm);
+
+# LB 30
+($ALcm | $HLcm | $NUcm) $OPcm;
+$CM+ $OPcm; # The $CM+ is from rule 10, an unattached CM is treated as AL.
+$CPcm ($ALcm | $HLcm | $NUcm);
+
+# LB 30a Do not break between regional indicators.
+$RIcm $RIcm;
+
+# Special forward rule for people & body part emoji:
+# don't break between relevant emoji and $EmojiMods
+$EmojiForMods $EmojiVar? $EmojiMods;
+
+#
+# Reverse Rules.
+#
+## -------------------------------------------------
+
+!!reverse;
+
+$CM+ $ALPlus;
+$CM+ $BA;
+$CM+ $BAX;
+$CM+ $BB;
+$CM+ $B2;
+$CM+ $CL;
+$CM+ $CP;
+$CM+ $EX;
+$CM+ $EXX;
+$CM+ $GL;
+$CM+ $HL;
+$CM+ $HY;
+$CM+ $H2;
+$CM+ $H3;
+$CM+ $ID;
+$CM+ $IN;
+$CM+ $IS;
+$CM+ $JL;
+$CM+ $JV;
+$CM+ $JT;
+$CM+ $NS;
+$CM+ $NSX;
+$CM+ $NU;
+$CM+ $OP;
+$CM+ $PO;
+$CM+ $POX;
+$CM+ $PR;
+$CM+ $PRX;
+$CM+ $QU;
+$CM+ $RI;
+$CM+ $SY;
+$CM+ $WJ;
+$CM+;
+
+
+#
+# Sequences of the form (shown forwards)
+# [CANT_CM] <break> [CM] [whatever]
+# The CM needs to behave as an AL
+#
+$AL_FOLLOW $CM+ / (
+ [$BK $CR $LF $NL $ZW {eof}] |
+ $SP+ $CM+ $SP |
+ $SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}])); # if LB 14 will match, need to suppress this break.
+ # LB14 says OP SP* x .
+ # becomes OP SP* x AL
+ # becomes OP SP* x CM+ AL_FOLLOW
+ #
+ # Further note: the $AL in [$AL {eof}] is only to work around
+ # a rule compiler bug which complains about
+ # empty sets otherwise.
+
+#
+# Sequences of the form (shown forwards)
+# [CANT_CM] <break> [CM] <break> [PR]
+# The CM needs to behave as an AL
+# This rule is concerned about getting the second of the two <breaks> in place.
+#
+
+[$PR $PRX ] / $CM+ [$BK $CR $LF $NL $ZW $SP {eof}];
+
+
+
+# LB 4, 5, 6
+
+$LB4Breaks [$LB4NonBreaks-$CM];
+$LB4Breaks $CM+ $CAN_CM;
+$LF $CR;
+
+
+# LB 7 x SP
+# x ZW
+[$SP $ZW] [$LB4NonBreaks-$CM];
+[$SP $ZW] $CM+ $CAN_CM;
+
+# LB 8 ZW SP* <break>
+# TODO: to implement this, we need more than one look-ahead hard break in play at a time.
+# Requires an engine enhancement.
+# / $SP* $ZW
+
+# Special reverse rule for people & body part emoji:
+# don't break $ZWJ from subsequent $EmojiForSeqs
+$EmojiForSeqs $ZWJ $EmojiMods? $EmojiVar? $EmojiForSeqs;
+
+# LB 9,10 Combining marks.
+# X $CM needs to behave like X, where X is not $SP or controls.
+# $CM not covered by the above needs to behave like $AL
+# Stick together any combining sequences that don't match other rules.
+$CM+ $CAN_CM;
+
+
+# LB 11
+$CM* $WJ $CM* $CAN_CM;
+$CM* $WJ [$LB8NonBreaks-$CM];
+
+ $CANT_CM $CM* $WJ;
+$CM* $CAN_CM $CM* $WJ;
+
+# LB 12a
+# [^SP BA HY] x GL
+#
+$CM* $GL $CM* [$LB8NonBreaks-[$CM $SP $BA $BAX $HY]];
+
+# LB 12
+# GL x
+#
+$CANT_CM $CM* $GL;
+$CM* $CAN_CM $CM* $GL;
+
+
+# LB 13
+# Do not include $EXX here
+$CL $CM+ $CAN_CM;
+$CP $CM+ $CAN_CM;
+$EX $CM+ $CAN_CM;
+$IS $CM+ $CAN_CM;
+$SY $CM+ $CAN_CM;
+
+$CL [$LB8NonBreaks-$CM];
+$CP [$LB8NonBreaks-$CM];
+$EX [$LB8NonBreaks-$CM];
+$IS [$LB8NonBreaks-$CM];
+$SY [$LB8NonBreaks-$CM];
+
+# Rule 13 & 14 taken together for an edge case.
+# Match this, shown forward
+# OP SP+ ($CM+ behaving as $AL) (CL | CP | EX | IS | SY)
+# This really wants to chain at the $CM+ (which is acting as an $AL)
+# except for $CM chaining being disabled.
+[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP;
+
+# LB 14 OP SP* x
+#
+$CM* $CAN_CM $SP* $CM* $OP;
+ $CANT_CM $SP* $CM* $OP;
+$AL_FOLLOW? $CM+ $SP $SP* $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP* $CM* $OP
+
+ $AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;
+$CM* $AL_FOLLOW_CM $CM+ $SP+ $CM* $OP;
+$SY $CM $SP+ $OP; # TODO: Experiment. Remove.
+
+
+
+# LB 15
+$CM* $OP $SP* $CM* $QU;
+
+# LB 16
+# Don't include $NSX here
+$CM* $NS $SP* $CM* ($CL | $CP);
+
+# LB 17
+$CM* $B2 $SP* $CM* $B2;
+
+# LB 18 break after spaces
+# Nothing explicit needed here.
+
+
+#
+# LB 19
+#
+$CM* $QU $CM* $CAN_CM; # . x QU
+$CM* $QU $LB18NonBreaks;
+
+
+$CM* $CAN_CM $CM* $QU; # QU x .
+ $CANT_CM $CM* $QU;
+
+#
+# LB 20 Break before and after CB.
+# nothing needed here.
+#
+
+# LB 21
+# Don't include $BAX or $NSX here
+$CM* ($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS)
+
+$CM* [$LB20NonBreaks-$CM] $CM* $BB; # BB x .
+[^$CB] $CM* $BB; #
+
+# LB21a
+[^$CB] $CM* ($HY | $BA | $BAX) $CM* $HL;
+
+# LB21b (reverse)
+$CM* $HL $CM* $SY;
+
+# LB 22
+$CM* $IN $CM* ($ALPlus | $HL);
+$CM* $IN $CM* $ID;
+# $CM* $IN $CM* $IN; # delete this rule for CSS loose
+$CM* $IN $CM* $NU;
+
+# LB 23
+# Do not include $POX here
+$CM* $PO $CM* $ID;
+$CM* $NU $CM* ($ALPlus | $HL);
+$CM* ($ALPlus | $HL) $CM* $NU;
+
+# LB 24
+# Do not include $PRX here
+$CM* $ID $CM* $PR;
+$CM* ($ALPlus | $HL) $CM* $PR;
+$CM* ($ALPlus | $HL) $CM* ($PO | $POX);
+
+
+# LB 25
+# Here do not include $POX at the beginning or $PRX at the end
+($CM* ($PR | $PRX | $PO))? ($CM* ($CL | $CP))? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP | $HY))? ($CM* ($PR | $PO | $POX))?;
+
+# LB 26
+$CM* ($H3 | $H2 | $JV | $JL) $CM* $JL;
+$CM* ($JT | $JV) $CM* ($H2 | $JV);
+$CM* $JT $CM* ($H3 | $JT);
+
+# LB 27
+# Do not include $POX or $PRX here
+$CM* $IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
+$CM* $PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
+$CM* ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
+
+# LB 28
+$CM* ($ALPlus | $HL) $CM* ($ALPlus | $HL);
+
+
+# LB 29
+$CM* ($ALPlus | $HL) $CM* $IS;
+
+# LB 30
+$CM* $OP $CM* ($ALPlus | $HL | $NU);
+$CM* ($ALPlus | $HL | $NU) $CM* $CP;
+
+# LB 30a
+$CM* $RI $CM* $RI;
+
+# Special reverse rule for people & body part emoji:
+# don't break between relevant emoji and $EmojiMods
+$EmojiMods $EmojiVar? $EmojiForMods;
+
+## -------------------------------------------------
+
+!!safe_reverse;
+
+# LB 9
+$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
+$CM+ $SP / .;
+
+# LB 14
+$SP+ $CM* $OP;
+
+# LB 15
+$SP+ $CM* $QU;
+
+# LB 16
+$SP+ $CM* ($CL | $CP);
+
+# LB 17
+$SP+ $CM* $B2;
+
+# LB 21
+$CM* ($HY | $BA) $CM* $HL;
+
+# LB 25
+($CM* ($IS | $SY))+ $CM* $NU;
+($CL | $CP) $CM* ($NU | $IS | $SY);
+
+# For dictionary-based break
+$dictionary $dictionary;
+
+## -------------------------------------------------
+
+!!safe_forward;
+
+# Skip forward over all character classes that are involved in
+# rules containing patterns with possibly more than one char
+# of context.
+#
+# It might be slightly more efficient to have specific rules
+# instead of one generic one, but only if we could
+# turn off rule chaining. We don't want to move more
+# than necessary.
+#
+[$CM $OP $QU $CL $CP $B2 $PR $PRX $HY $BA $SP $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $PRX $HY $BA $dictionary];
+$dictionary $dictionary;
+
--- /dev/null
+# Copyright (c) 2002-2015 International Business Machines Corporation and
+# others. All Rights Reserved.
+#
+# file: line_loose_fi.txt
+#
+# Line Breaking Rules
+# Implement default line breaking as defined by
+# Unicode Standard Annex #14 Revision 29 for Unicode 6.2
+# http://www.unicode.org/reports/tr14/
+# tailored as noted in 2nd paragraph below.
+#
+# TODO: Rule LB 8 remains as it was in Unicode 5.2
+# This is only because of a limitation of ICU break engine implementation,
+# not because the older behavior is desirable.
+#
+# This tailors the line break behavior both for Finnish and to correspond to CSS
+# line-break=loose (BCP47 -u-lb-loose) as defined for languages other than
+# Chinese & Japanese.
+# It sets characters of class CJ to behave like ID.
+# In addition, it allows breaks before 3005, 303B, 309D, 309E, 30FD, 30FE (all NS).
+
+#
+# Character Classes defined by TR 14.
+#
+
+!!chain;
+!!LBCMNoChain;
+
+
+!!lookAheadHardBreak;
+#
+# !!lookAheadHardBreak Described here because it is (as yet) undocumented elsewhere
+# and only used for the line break rules.
+#
+# It is used in the implementation of rule LB 10
+# which says to treat any combining mark that is not attached to a base
+# character as if it were of class AL (alphabetic).
+#
+# The problem occurs in the reverse rules.
+#
+# Consider a sequence like, with correct breaks as shown
+# LF ID CM AL AL
+# ^ ^ ^
+# Then consider the sequence without the initial ID (ideographic)
+# LF CM AL AL
+# ^ ^
+# Our CM, which in the first example was attached to the ideograph,
+# is now unattached, becomes an alpha, and joins in with the other
+# alphas.
+#
+# When iterating forwards, these sequences do not present any problems
+# When iterating backwards, we need to look ahead when encountering
+# a CM to see whether it attaches to something further on or not.
+# (Look-ahead in a reverse rule is looking towards the start)
+#
+# If the CM is unattached, we need to force a break.
+#
+# !!lookAheadHardBreak forces the run time state machine to
+# stop immediately when a look ahead rule ( '/' operator) matches,
+# and set the match position to that of the look-ahead operator,
+# no matter what other rules may be in play at the time.
+#
+# See rule LB 19 for an example.
+#
+
+$AI = [:LineBreak = Ambiguous:];
+$AL = [:LineBreak = Alphabetic:];
+$BA = [[:LineBreak = Break_After:] - [\u2010]];
+$HH = [\u2010];
+$BB = [:LineBreak = Break_Before:];
+$BK = [:LineBreak = Mandatory_Break:];
+$B2 = [:LineBreak = Break_Both:];
+$CB = [:LineBreak = Contingent_Break:];
+$CJ = [:LineBreak = Conditional_Japanese_Starter:];
+$CL = [:LineBreak = Close_Punctuation:];
+$CM = [:LineBreak = Combining_Mark:];
+$CP = [:LineBreak = Close_Parenthesis:];
+$CR = [:LineBreak = Carriage_Return:];
+$EX = [:LineBreak = Exclamation:];
+$GL = [:LineBreak = Glue:];
+$HL = [:LineBreak = Hebrew_Letter:];
+$HY = [:LineBreak = Hyphen:];
+$H2 = [:LineBreak = H2:];
+$H3 = [:LineBreak = H3:];
+$ID = [[:LineBreak = Ideographic:] $CJ];
+$IN = [:LineBreak = Inseperable:];
+$IS = [:LineBreak = Infix_Numeric:];
+$JL = [:LineBreak = JL:];
+$JV = [:LineBreak = JV:];
+$JT = [:LineBreak = JT:];
+$LF = [:LineBreak = Line_Feed:];
+$NL = [:LineBreak = Next_Line:];
+$NSX = [\u3005 \u303B \u309D \u309E \u30FD \u30FE];
+$NS = [[:LineBreak = Nonstarter:] - $NSX];
+$NU = [:LineBreak = Numeric:];
+$OP = [:LineBreak = Open_Punctuation:];
+$PO = [:LineBreak = Postfix_Numeric:];
+$PR = [:LineBreak = Prefix_Numeric:];
+$QU = [:LineBreak = Quotation:];
+$RI = [:LineBreak = Regional_Indicator:];
+$SA = [:LineBreak = Complex_Context:];
+$SG = [:LineBreak = Surrogate:];
+$SP = [:LineBreak = Space:];
+$SY = [:LineBreak = Break_Symbols:];
+$WJ = [:LineBreak = Word_Joiner:];
+$XX = [:LineBreak = Unknown:];
+$ZW = [:LineBreak = ZWSpace:];
+
+# Special character classes for people & body part emoji:
+# Subsets of $CM:
+$ZWJ = \u200D;
+$EmojiVar = \uFE0F;
+# The following are subsets of $ID
+$EmojiForSeqs = [\u2764 \U0001F466-\U0001F469 \U0001F48B];
+$EmojiForMods = [\u261D \u270A-\u270C \U0001F385 \U0001F3C3-\U0001F3C4 \U0001F3C7 \U0001F3CA \U0001F442-\U0001F443 \U0001F446-\U0001F450 \U0001F466-\U0001F469 \U0001F46E-\U0001F478 \U0001F47C \U0001F481-\U0001F483 \U0001F485-\U0001F487 \U0001F4AA \U0001F596 \U0001F645-\U0001F647 \U0001F64B-\U0001F64F \U0001F6A3 \U0001F6B4-\U0001F6B6 \U0001F6C0];
+$EmojiMods = [\U0001F3FB-\U0001F3FF];
+
+# Dictionary character set, for triggering language-based break engines. Currently
+# limited to LineBreak=Complex_Context. Note that this set only works in Unicode
+# 5.0 or later as the definition of Complex_Context was corrected to include all
+# characters requiring dictionary break.
+
+$dictionary = [:LineBreak = Complex_Context:];
+
+#
+# Rule LB1. By default, treat AI (characters with ambiguous east Asian width),
+# SA (South East Asian: Thai, Lao, Khmer)
+# SG (Unpaired Surrogates)
+# XX (Unknown, unassigned)
+# as $AL (Alphabetic)
+#
+$ALPlus = [$AL $AI $SA $SG $XX];
+
+#
+# Combining Marks. X $CM* behaves as if it were X. Rule LB6.
+#
+$ALcm = $ALPlus $CM*;
+$BAcm = $BA $CM*;
+$HHcm = $HH $CM*;
+$BBcm = $BB $CM*;
+$B2cm = $B2 $CM*;
+$CLcm = $CL $CM*;
+$CPcm = $CP $CM*;
+$EXcm = $EX $CM*;
+$GLcm = $GL $CM*;
+$HLcm = $HL $CM*;
+$HYcm = $HY $CM*;
+$H2cm = $H2 $CM*;
+$H3cm = $H3 $CM*;
+$IDcm = $ID $CM*;
+$INcm = $IN $CM*;
+$IScm = $IS $CM*;
+$JLcm = $JL $CM*;
+$JVcm = $JV $CM*;
+$JTcm = $JT $CM*;
+$NScm = $NS $CM*;
+$NSXcm = $NSX $CM*;
+$NUcm = $NU $CM*;
+$OPcm = $OP $CM*;
+$POcm = $PO $CM*;
+$PRcm = $PR $CM*;
+$QUcm = $QU $CM*;
+$RIcm = $RI $CM*;
+$SYcm = $SY $CM*;
+$WJcm = $WJ $CM*;
+
+## -------------------------------------------------
+
+!!forward;
+
+#
+# Each class of character can stand by itself as an unbroken token, with trailing combining stuff
+#
+$ALPlus $CM+;
+$BA $CM+;
+$HH $CM+;
+$BB $CM+;
+$B2 $CM+;
+$CL $CM+;
+$CP $CM+;
+$EX $CM+;
+$GL $CM+;
+$HL $CM+;
+$HY $CM+;
+$H2 $CM+;
+$H3 $CM+;
+$ID $CM+;
+$IN $CM+;
+$IS $CM+;
+$JL $CM+;
+$JV $CM+;
+$JT $CM+;
+$NS $CM+;
+$NSX $CM+;
+$NU $CM+;
+$OP $CM+;
+$PO $CM+;
+$PR $CM+;
+$QU $CM+;
+$RI $CM+;
+$SY $CM+;
+$WJ $CM+;
+
+#
+# CAN_CM is the set of characters that may combine with CM combining chars.
+# Note that Linebreak UAX 14's concept of a combining char and the rules
+# for what they can combine with are _very_ different from the rest of Unicode.
+#
+# Note that $CM itself is left out of this set. If CM is needed as a base
+# it must be listed separately in the rule.
+#
+$CAN_CM = [^$SP $BK $CR $LF $NL $ZW $CM]; # Bases that can take CMs
+$CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM]; # Bases that can't take CMs
+
+#
+# AL_FOLLOW set of chars that can unconditionally follow an AL
+# Needed in rules where stand-alone $CM s are treated as AL.
+# Chaining is disabled with CM because it causes other failures,
+# so for this one case we need to manually list out longer sequences.
+#
+$AL_FOLLOW_NOCM = [$BK $CR $LF $NL $ZW $SP];
+$AL_FOLLOW_CM = [$CL $CP $EX $HL $IS $SY $WJ $GL $OP $QU $BA $HH $HY $NS $NSX $IN $NU $ALPlus];
+$AL_FOLLOW = [$AL_FOLLOW_NOCM $AL_FOLLOW_CM];
+
+
+#
+# Rule LB 4, 5 Mandatory (Hard) breaks.
+#
+$LB4Breaks = [$BK $CR $LF $NL];
+$LB4NonBreaks = [^$BK $CR $LF $NL];
+$CR $LF {100};
+
+#
+# LB 6 Do not break before hard line breaks.
+#
+$LB4NonBreaks? $LB4Breaks {100}; # LB 5 do not break before hard breaks.
+$CAN_CM $CM* $LB4Breaks {100};
+$CM+ $LB4Breaks {100};
+
+# LB 7 x SP
+# x ZW
+$LB4NonBreaks [$SP $ZW];
+$CAN_CM $CM* [$SP $ZW];
+$CM+ [$SP $ZW];
+
+#
+# LB 8 Break after zero width space
+# TODO: ZW SP* <break>
+# An engine change is required to write the reverse rule for this.
+# For now, leave the Unicode 5.2 rule, ZW <break>
+#
+$LB8Breaks = [$LB4Breaks $ZW];
+$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
+
+
+# Special forward rule for people & body part emoji:
+# don't break $ZWJ from subsequent $EmojiForSeqs
+$EmojiForSeqs $EmojiVar? $EmojiMods? $ZWJ $EmojiForSeqs;
+
+# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
+# $CM not covered by the above needs to behave like $AL
+# See definition of $CAN_CM.
+
+$CAN_CM $CM+; # Stick together any combining sequences that don't match other rules.
+$CM+;
+
+#
+# LB 11 Do not break before or after WORD JOINER & related characters.
+#
+$CAN_CM $CM* $WJcm;
+$LB8NonBreaks $WJcm;
+$CM+ $WJcm;
+
+$WJcm $CANT_CM;
+$WJcm $CAN_CM $CM*;
+
+#
+# LB 12 Do not break after NBSP and related characters.
+# GL x
+#
+$GLcm $CAN_CM $CM*;
+$GLcm $CANT_CM;
+
+#
+# LB 12a Do not break before NBSP and related characters ...
+# [^SP BA HY] x GL
+#
+# In this tailoring U+2010 is split out of $BA into $HH (see the class
+# definitions), so $HH is excluded on the left alongside $BA and $HY.
+#
+# First rule: any non-breaking base other than SP, BA, HH or HY, with its
+# attached combining marks, followed by GL (and GL's combining marks).
+# Second rule: a run of combining marks with no base, followed by GL.
+#
+[[$LB8NonBreaks] - [$SP $BA $HH $HY]] $CM* $GLcm;
+$CM+ $GLcm; # by rule 10, stand-alone CM behaves as AL
+
+
+
+#
+# LB 13 Don't break before ']' or '!' or ';' or '/', even after spaces.
+#
+$LB8NonBreaks $CL;
+$CAN_CM $CM* $CL;
+$CM+ $CL; # by rule 10, stand-alone CM behaves as AL
+
+$LB8NonBreaks $CP;
+$CAN_CM $CM* $CP;
+$CM+ $CP; # by rule 10, stand-alone CM behaves as AL
+
+$LB8NonBreaks $EX;
+$CAN_CM $CM* $EX;
+$CM+ $EX; # by rule 10, stand-alone CM behaves as AL
+
+$LB8NonBreaks $IS;
+$CAN_CM $CM* $IS;
+$CM+ $IS; # by rule 10, stand-alone CM behaves as AL
+
+$LB8NonBreaks $SY;
+$CAN_CM $CM* $SY;
+$CM+ $SY; # by rule 10, stand-alone CM behaves as AL
+
+
+#
+# LB 14 Do not break after OP, even after spaces
+#
+$OPcm $SP* $CAN_CM $CM*;
+$OPcm $SP* $CANT_CM;
+
+$OPcm $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL
+
+# LB 15
+$QUcm $SP* $OPcm;
+
+# LB 16
+# Do not break between closing punctuation and $NS, even with intervening spaces
+# But DO allow a break between closing punctuation and $NSX, don't include it here
+($CLcm | $CPcm) $SP* $NScm;
+
+# LB 17
+$B2cm $SP* $B2cm;
+
+#
+# LB 18 Break after spaces.
+#
+$LB18NonBreaks = [$LB8NonBreaks - [$SP]];
+$LB18Breaks = [$LB8Breaks $SP];
+
+
+# LB 19
+# x QU
+$LB18NonBreaks $CM* $QUcm;
+$CM+ $QUcm;
+
+# QU x
+$QUcm .?;
+$QUcm $LB18NonBreaks $CM*; # Don't let a combining mark go onto $CR, $BK, etc.
+ # TODO: I don't think this rule is needed.
+
+
+# LB 20
+# <break> $CB
+# $CB <break>
+
+$LB20NonBreaks = [$LB18NonBreaks - $CB];
+
+# LB 20.09 added rule for Finnish tailoring
+# LB 21 x (BA | HY | NS)
+# BB x
+#
+# DO allow breaks here before NSXcm, so don't include it
+$LB20NonBreaks $CM* ($BAcm | $HHcm | $HYcm | $NScm) / $AL;
+$LB20NonBreaks $CM* ($BAcm | $HHcm | $HYcm | $NScm);
+($HY | $HH) $AL;
+
+$BBcm [^$CB]; # $BB x
+$BBcm $LB20NonBreaks $CM*;
+
+# LB 21a Don't break after Hebrew + Hyphen
+# HL (HY | BA) x
+#
+$HLcm ($HYcm | $BAcm | $HHcm) [^$CB]?;
+
+# LB 21b (forward) Don't break between SY and HL
+# (break between HL and SY already disallowed by LB 13 above)
+$SYcm $HLcm;
+
+# LB 22
+($ALcm | $HLcm) $INcm;
+$CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL
+$IDcm $INcm;
+$INcm $INcm;
+$NUcm $INcm;
+
+
+# $LB 23
+$IDcm $POcm;
+$ALcm $NUcm; # includes $LB19
+$HLcm $NUcm;
+$CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL
+$NUcm $ALcm;
+$NUcm $HLcm;
+
+#
+# LB 24
+#
+$PRcm $IDcm;
+$PRcm ($ALcm | $HLcm);
+$POcm ($ALcm | $HLcm);
+
+#
+# LB 25 Numbers.
+#
+($PRcm | $POcm)? ($OPcm | $HYcm)? $NUcm ($NUcm | $SYcm | $IScm)* ($CLcm | $CPcm)? ($PRcm | $POcm)?;
+
+# LB 26 Do not break a Korean syllable
+#
+$JLcm ($JLcm | $JVcm | $H2cm | $H3cm);
+($JVcm | $H2cm) ($JVcm | $JTcm);
+($JTcm | $H3cm) $JTcm;
+
+# LB 27 Treat korean Syllable Block the same as ID (don't break it)
+($JLcm | $JVcm | $JTcm | $H2cm | $H3cm) $INcm;
+($JLcm | $JVcm | $JTcm | $H2cm | $H3cm) $POcm;
+$PRcm ($JLcm | $JVcm | $JTcm | $H2cm | $H3cm);
+
+
+# LB 28 Do not break between alphabetics
+#
+($ALcm | $HLcm) ($ALcm | $HLcm);
+$CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treated as AL
+
+# LB 29
+$IScm ($ALcm | $HLcm);
+
+# LB 30
+($ALcm | $HLcm | $NUcm) $OPcm;
+$CM+ $OPcm; # The $CM+ is from rule 10, an unattached CM is treated as AL.
+$CPcm ($ALcm | $HLcm | $NUcm);
+
+# LB 30a Do not break between regional indicators.
+$RIcm $RIcm;
+
+# Special forward rule for people & body part emoji:
+# don't break between relevant emoji and $EmojiMods
+$EmojiForMods $EmojiVar? $EmojiMods;
+
+#
+# Reverse Rules.
+#
+## -------------------------------------------------
+
+!!reverse;
+
+$CM+ $ALPlus;
+$CM+ $BA;
+$CM+ $HH;
+$CM+ $BB;
+$CM+ $B2;
+$CM+ $CL;
+$CM+ $CP;
+$CM+ $EX;
+$CM+ $GL;
+$CM+ $HL;
+$CM+ $HY;
+$CM+ $H2;
+$CM+ $H3;
+$CM+ $ID;
+$CM+ $IN;
+$CM+ $IS;
+$CM+ $JL;
+$CM+ $JV;
+$CM+ $JT;
+$CM+ $NS;
+$CM+ $NSX;
+$CM+ $NU;
+$CM+ $OP;
+$CM+ $PO;
+$CM+ $PR;
+$CM+ $QU;
+$CM+ $RI;
+$CM+ $SY;
+$CM+ $WJ;
+$CM+;
+
+
+#
+# Sequences of the form (shown forwards)
+# [CANT_CM] <break> [CM] [whatever]
+# The CM needs to behave as an AL
+#
+# This is a reverse rule, so it reads back to front: the part before '/'
+# consumes one $AL_FOLLOW character and the $CM+ run, and the look-ahead
+# after '/' inspects the text on the far side of that run (towards the
+# start). With !!lookAheadHardBreak (described near the top of this file)
+# a match stops the state machine and puts the boundary at the '/'.
+# The look-ahead has three alternatives:
+#   - a hard break class, $ZW or {eof};
+#   - spaces, more combining marks, then another space;
+#   - spaces and optional combining marks followed by anything other
+#     than $OP, $CM or $SP (or {eof}).
+#
+$AL_FOLLOW $CM+ / (
+ [$BK $CR $LF $NL $ZW {eof}] |
+ $SP+ $CM+ $SP |
+ $SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}])); # if LB 14 will match, need to suppress this break.
+ # LB14 says OP SP* x .
+ # becomes OP SP* x AL
+ # becomes OP SP* x CM+ AL_FOLLOW
+ #
+ # Further note: the $AL in [$AL {eof}] is only to work around
+ # a rule compiler bug which complains about
+ # empty sets otherwise.
+
+#
+# Sequences of the form (shown forwards)
+# [CANT_CM] <break> [CM] <break> [PR]
+# The CM needs to behave as an AL
+# This rule is concerned about getting the second of the two <breaks> in place.
+#
+
+[$PR ] / $CM+ [$BK $CR $LF $NL $ZW $SP {eof}];
+
+
+
+# LB 4, 5, 6
+
+$LB4Breaks [$LB4NonBreaks-$CM];
+$LB4Breaks $CM+ $CAN_CM;
+$LF $CR;
+
+
+# LB 7 x SP
+# x ZW
+[$SP $ZW] [$LB4NonBreaks-$CM];
+[$SP $ZW] $CM+ $CAN_CM;
+
+# LB 8 ZW SP* <break>
+# TODO: to implement this, we need more than one look-ahead hard break in play at a time.
+# Requires an engine enhancement.
+# / $SP* $ZW
+
+# Special reverse rule for people & body part emoji:
+# don't break $ZWJ from subsequent $EmojiForSeqs
+$EmojiForSeqs $ZWJ $EmojiMods? $EmojiVar? $EmojiForSeqs;
+
+# LB 9,10 Combining marks.
+# X $CM needs to behave like X, where X is not $SP or controls.
+# $CM not covered by the above needs to behave like $AL
+# Stick together any combining sequences that don't match other rules.
+$CM+ $CAN_CM;
+
+
+# LB 11
+$CM* $WJ $CM* $CAN_CM;
+$CM* $WJ [$LB8NonBreaks-$CM];
+
+ $CANT_CM $CM* $WJ;
+$CM* $CAN_CM $CM* $WJ;
+
+# LB 12a
+# [^SP BA HY] x GL
+#
+$CM* $GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HH $HY]];
+
+# LB 12
+# GL x
+#
+$CANT_CM $CM* $GL;
+$CM* $CAN_CM $CM* $GL;
+
+
+# LB 13
+$CL $CM+ $CAN_CM;
+$CP $CM+ $CAN_CM;
+$EX $CM+ $CAN_CM;
+$IS $CM+ $CAN_CM;
+$SY $CM+ $CAN_CM;
+
+$CL [$LB8NonBreaks-$CM];
+$CP [$LB8NonBreaks-$CM];
+$EX [$LB8NonBreaks-$CM];
+$IS [$LB8NonBreaks-$CM];
+$SY [$LB8NonBreaks-$CM];
+
+# Rule 13 & 14 taken together for an edge case.
+# Match this, shown forward
+# OP SP+ ($CM+ behaving as $AL) (CL | CP | EX | IS | SY)
+# This really wants to chain at the $CM+ (which is acting as an $AL)
+# except for $CM chaining being disabled.
+[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP;
+
+# LB 14 OP SP* x
+#
+$CM* $CAN_CM $SP* $CM* $OP;
+ $CANT_CM $SP* $CM* $OP;
+$AL_FOLLOW? $CM+ $SP $SP* $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP* $CM* $OP
+
+ $AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;
+$CM* $AL_FOLLOW_CM $CM+ $SP+ $CM* $OP;
+$SY $CM $SP+ $OP; # TODO: Experiment. Remove.
+
+
+
+# LB 15
+$CM* $OP $SP* $CM* $QU;
+
+# LB 16
+# Don't include $NSX here
+$CM* $NS $SP* $CM* ($CL | $CP);
+
+# LB 17
+$CM* $B2 $SP* $CM* $B2;
+
+# LB 18 break after spaces
+# Nothing explicit needed here.
+
+
+#
+# LB 19
+#
+$CM* $QU $CM* $CAN_CM; # . x QU
+$CM* $QU $LB18NonBreaks;
+
+
+$CM* $CAN_CM $CM* $QU; # QU x .
+ $CANT_CM $CM* $QU;
+
+#
+# LB 20 Break before and after CB.
+# nothing needed here.
+#
+
+# LB 20.09 added rule for Finnish tailoring
+$AL ($HY | $HH) / $SP;
+
+# LB 21
+# Don't include $NSX here
+$CM* ($BA | $HH | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS)
+
+$CM* [$LB20NonBreaks-$CM] $CM* $BB; # BB x .
+[^$CB] $CM* $BB; #
+
+# LB21a
+[^$CB] $CM* ($HY | $BA | $HH) $CM* $HL;
+
+# LB21b (reverse)
+$CM* $HL $CM* $SY;
+
+# LB 22
+$CM* $IN $CM* ($ALPlus | $HL);
+$CM* $IN $CM* $ID;
+$CM* $IN $CM* $IN;
+$CM* $IN $CM* $NU;
+
+# LB 23
+$CM* $PO $CM* $ID;
+$CM* $NU $CM* ($ALPlus | $HL);
+$CM* ($ALPlus | $HL) $CM* $NU;
+
+# LB 24
+$CM* $ID $CM* $PR;
+$CM* ($ALPlus | $HL) $CM* $PR;
+$CM* ($ALPlus | $HL) $CM* $PO;
+
+
+# LB 25
+($CM* ($PR | $PO))? ($CM* ($CL | $CP))? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP | $HY))? ($CM* ($PR | $PO))?;
+
+# LB 26
+$CM* ($H3 | $H2 | $JV | $JL) $CM* $JL;
+$CM* ($JT | $JV) $CM* ($H2 | $JV);
+$CM* $JT $CM* ($H3 | $JT);
+
+# LB 27
+$CM* $IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
+$CM* $PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
+$CM* ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
+
+# LB 28
+$CM* ($ALPlus | $HL) $CM* ($ALPlus | $HL);
+
+
+# LB 29
+$CM* ($ALPlus | $HL) $CM* $IS;
+
+# LB 30
+$CM* $OP $CM* ($ALPlus | $HL | $NU);
+$CM* ($ALPlus | $HL | $NU) $CM* $CP;
+
+# LB 30a
+$CM* $RI $CM* $RI;
+
+# Special reverse rule for people & body part emoji:
+# don't break between relevant emoji and $EmojiMods
+$EmojiMods $EmojiVar? $EmojiForMods;
+
+## -------------------------------------------------
+
+!!safe_reverse;
+
+# LB 9
+$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
+$CM+ $SP / .;
+
+# LB 14
+$SP+ $CM* $OP;
+
+# LB 15
+$SP+ $CM* $QU;
+
+# LB 16
+$SP+ $CM* ($CL | $CP);
+
+# LB 17
+$SP+ $CM* $B2;
+
+# LB 21
+$CM* ($HY | $BA | $HH) $CM* $HL;
+
+# LB 25
+($CM* ($IS | $SY))+ $CM* $NU;
+($CL | $CP) $CM* ($NU | $IS | $SY);
+
+# For dictionary-based break
+$dictionary $dictionary;
+
+## -------------------------------------------------
+
+!!safe_forward;
+
+# Skip forward over all character classes that are involved in
+# rules containing patterns with possibly more than one char
+# of context.
+#
+# It might be slightly more efficient to have specific rules
+# instead of one generic one, but only if we could
+# turn off rule chaining. We don't want to move more
+# than necessary.
+#
+# The first rule matches a run of the listed classes followed by one
+# character outside them; $SP is in the run set but not in the negated
+# set, so a space may also be the character that ends the run.
+#
+# NOTE(review): this tailoring moves U+2010 out of $BA into $HH, and the
+# !!safe_reverse LB 21 rule lists $HH explicitly, but $HH does not appear in
+# either set below. Confirm whether $HH should be listed next to $HY and $BA.
+#
+[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $SP $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $dictionary];
+# For dictionary-based break (same rule as in !!safe_reverse)
+$dictionary $dictionary;
+
--- /dev/null
+# Copyright (c) 2002-2015 International Business Machines Corporation and
+# others. All Rights Reserved.
+#
+# file: line_normal.txt
+#
+# Line Breaking Rules
+# Implement default line breaking as defined by
+# Unicode Standard Annex #14 Revision 29 for Unicode 6.2
+# http://www.unicode.org/reports/tr14/
+# tailored as noted in 2nd paragraph below.
+#
+# TODO: Rule LB 8 remains as it was in Unicode 5.2
+# This is only because of a limitation of ICU break engine implementation,
+# not because the older behavior is desirable.
+#
+# This tailors the line break behavior to correspond to CSS
+# line-break=normal (BCP47 -u-lb-normal) as defined for languages other than
+# Chinese & Japanese.
+# It sets characters of class CJ to behave like ID.
+
+#
+# Character Classes defined by TR 14.
+#
+
+!!chain;
+!!LBCMNoChain;
+
+
+!!lookAheadHardBreak;
+#
+# !!lookAheadHardBreak Described here because it is (as yet) undocumented elsewhere
+# and only used for the line break rules.
+#
+# It is used in the implementation of rule LB 10
+# which says to treat any combining mark that is not attached to a base
+# character as if it were of class AL (alphabetic).
+#
+# The problem occurs in the reverse rules.
+#
+# Consider a sequence like, with correct breaks as shown
+# LF ID CM AL AL
+# ^ ^ ^
+# Then consider the sequence without the initial ID (ideographic)
+# LF CM AL AL
+# ^ ^
+# Our CM, which in the first example was attached to the ideograph,
+# is now unattached, becomes an alpha, and joins in with the other
+# alphas.
+#
+# When iterating forwards, these sequences do not present any problems
+# When iterating backwards, we need to look ahead when encountering
+# a CM to see whether it attaches to something further on or not.
+# (Look-ahead in a reverse rule is looking towards the start)
+#
+# If the CM is unattached, we need to force a break.
+#
+# !!lookAheadHardBreak forces the run time state machine to
+# stop immediately when a look ahead rule ( '/' operator) matches,
+# and set the match position to that of the look-ahead operator,
+# no matter what other rules may be in play at the time.
+#
+# See rule LB 19 for an example.
+#
+
+$AI = [:LineBreak = Ambiguous:];
+$AL = [:LineBreak = Alphabetic:];
+$BA = [:LineBreak = Break_After:];
+$BB = [:LineBreak = Break_Before:];
+$BK = [:LineBreak = Mandatory_Break:];
+$B2 = [:LineBreak = Break_Both:];
+$CB = [:LineBreak = Contingent_Break:];
+$CJ = [:LineBreak = Conditional_Japanese_Starter:];
+$CL = [:LineBreak = Close_Punctuation:];
+$CM = [:LineBreak = Combining_Mark:];
+$CP = [:LineBreak = Close_Parenthesis:];
+$CR = [:LineBreak = Carriage_Return:];
+$EX = [:LineBreak = Exclamation:];
+$GL = [:LineBreak = Glue:];
+$HL = [:LineBreak = Hebrew_Letter:];
+$HY = [:LineBreak = Hyphen:];
+$H2 = [:LineBreak = H2:];
+$H3 = [:LineBreak = H3:];
+$ID = [[:LineBreak = Ideographic:] $CJ];
+$IN = [:LineBreak = Inseperable:];
+$IS = [:LineBreak = Infix_Numeric:];
+$JL = [:LineBreak = JL:];
+$JV = [:LineBreak = JV:];
+$JT = [:LineBreak = JT:];
+$LF = [:LineBreak = Line_Feed:];
+$NL = [:LineBreak = Next_Line:];
+$NS = [:LineBreak = Nonstarter:];
+$NU = [:LineBreak = Numeric:];
+$OP = [:LineBreak = Open_Punctuation:];
+$PO = [:LineBreak = Postfix_Numeric:];
+$PR = [:LineBreak = Prefix_Numeric:];
+$QU = [:LineBreak = Quotation:];
+$RI = [:LineBreak = Regional_Indicator:];
+$SA = [:LineBreak = Complex_Context:];
+$SG = [:LineBreak = Surrogate:];
+$SP = [:LineBreak = Space:];
+$SY = [:LineBreak = Break_Symbols:];
+$WJ = [:LineBreak = Word_Joiner:];
+$XX = [:LineBreak = Unknown:];
+$ZW = [:LineBreak = ZWSpace:];
+
+# Special character classes for people & body part emoji:
+# Subsets of $CM:
+$ZWJ = \u200D;
+$EmojiVar = \uFE0F;
+# The following are subsets of $ID
+$EmojiForSeqs = [\u2764 \U0001F466-\U0001F469 \U0001F48B];
+$EmojiForMods = [\u261D \u270A-\u270C \U0001F385 \U0001F3C3-\U0001F3C4 \U0001F3C7 \U0001F3CA \U0001F442-\U0001F443 \U0001F446-\U0001F450 \U0001F466-\U0001F469 \U0001F46E-\U0001F478 \U0001F47C \U0001F481-\U0001F483 \U0001F485-\U0001F487 \U0001F4AA \U0001F596 \U0001F645-\U0001F647 \U0001F64B-\U0001F64F \U0001F6A3 \U0001F6B4-\U0001F6B6 \U0001F6C0];
+$EmojiMods = [\U0001F3FB-\U0001F3FF];
+
+# Dictionary character set, for triggering language-based break engines. Currently
+# limited to LineBreak=Complex_Context. Note that this set only works in Unicode
+# 5.0 or later as the definition of Complex_Context was corrected to include all
+# characters requiring dictionary break.
+
+$dictionary = [:LineBreak = Complex_Context:];
+
+#
+# Rule LB1. By default, treat AI (characters with ambiguous east Asian width),
+# SA (South East Asian: Thai, Lao, Khmer)
+# SG (Unpaired Surrogates)
+# XX (Unknown, unassigned)
+# as $AL (Alphabetic)
+#
+$ALPlus = [$AL $AI $SA $SG $XX];
+
+#
+# Combining Marks. X $CM* behaves as if it were X. Rule LB6.
+#
+$ALcm = $ALPlus $CM*;
+$BAcm = $BA $CM*;
+$BBcm = $BB $CM*;
+$B2cm = $B2 $CM*;
+$CLcm = $CL $CM*;
+$CPcm = $CP $CM*;
+$EXcm = $EX $CM*;
+$GLcm = $GL $CM*;
+$HLcm = $HL $CM*;
+$HYcm = $HY $CM*;
+$H2cm = $H2 $CM*;
+$H3cm = $H3 $CM*;
+$IDcm = $ID $CM*;
+$INcm = $IN $CM*;
+$IScm = $IS $CM*;
+$JLcm = $JL $CM*;
+$JVcm = $JV $CM*;
+$JTcm = $JT $CM*;
+$NScm = $NS $CM*;
+$NUcm = $NU $CM*;
+$OPcm = $OP $CM*;
+$POcm = $PO $CM*;
+$PRcm = $PR $CM*;
+$QUcm = $QU $CM*;
+$RIcm = $RI $CM*;
+$SYcm = $SY $CM*;
+$WJcm = $WJ $CM*;
+
+## -------------------------------------------------
+
+!!forward;
+
+#
+# Each class of character can stand by itself as an unbroken token, with trailing combining stuff
+#
+$ALPlus $CM+;
+$BA $CM+;
+$BB $CM+;
+$B2 $CM+;
+$CL $CM+;
+$CP $CM+;
+$EX $CM+;
+$GL $CM+;
+$HL $CM+;
+$HY $CM+;
+$H2 $CM+;
+$H3 $CM+;
+$ID $CM+;
+$IN $CM+;
+$IS $CM+;
+$JL $CM+;
+$JV $CM+;
+$JT $CM+;
+$NS $CM+;
+$NU $CM+;
+$OP $CM+;
+$PO $CM+;
+$PR $CM+;
+$QU $CM+;
+$RI $CM+;
+$SY $CM+;
+$WJ $CM+;
+
+#
+# CAN_CM is the set of characters that may combine with CM combining chars.
+# Note that Linebreak UAX 14's concept of a combining char and the rules
+# for what they can combine with are _very_ different from the rest of Unicode.
+#
+# Note that $CM itself is left out of this set. If CM is needed as a base
+# it must be listed separately in the rule.
+#
+$CAN_CM = [^$SP $BK $CR $LF $NL $ZW $CM]; # Bases that can take CMs
+$CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM]; # Bases that can't take CMs
+
+#
+# AL_FOLLOW set of chars that can unconditionally follow an AL
+# Needed in rules where stand-alone $CM s are treated as AL.
+# Chaining is disabled with CM because it causes other failures,
+# so for this one case we need to manually list out longer sequences.
+#
+$AL_FOLLOW_NOCM = [$BK $CR $LF $NL $ZW $SP];
+$AL_FOLLOW_CM = [$CL $CP $EX $HL $IS $SY $WJ $GL $OP $QU $BA $HY $NS $IN $NU $ALPlus];
+$AL_FOLLOW = [$AL_FOLLOW_NOCM $AL_FOLLOW_CM];
+
+
+#
+# Rule LB 4, 5 Mandatory (Hard) breaks.
+#
+$LB4Breaks = [$BK $CR $LF $NL];
+$LB4NonBreaks = [^$BK $CR $LF $NL];
+$CR $LF {100};
+
+#
+# LB 6 Do not break before hard line breaks.
+#
+$LB4NonBreaks? $LB4Breaks {100}; # LB 5 do not break before hard breaks.
+$CAN_CM $CM* $LB4Breaks {100};
+$CM+ $LB4Breaks {100};
+
+# LB 7 x SP
+# x ZW
+$LB4NonBreaks [$SP $ZW];
+$CAN_CM $CM* [$SP $ZW];
+$CM+ [$SP $ZW];
+
+#
+# LB 8 Break after zero width space
+# TODO: ZW SP* <break>
+# An engine change is required to write the reverse rule for this.
+# For now, leave the Unicode 5.2 rule, ZW <break>
+#
+$LB8Breaks = [$LB4Breaks $ZW];
+$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
+
+
+# Special forward rule for people & body part emoji:
+# don't break $ZWJ from subsequent $EmojiForSeqs
+$EmojiForSeqs $EmojiVar? $EmojiMods? $ZWJ $EmojiForSeqs;
+
+# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
+# $CM not covered by the above needs to behave like $AL
+# See definition of $CAN_CM.
+
+$CAN_CM $CM+; # Stick together any combining sequences that don't match other rules.
+$CM+;
+
+#
+# LB 11 Do not break before or after WORD JOINER & related characters.
+#
+$CAN_CM $CM* $WJcm;
+$LB8NonBreaks $WJcm;
+$CM+ $WJcm;
+
+$WJcm $CANT_CM;
+$WJcm $CAN_CM $CM*;
+
+#
+# LB 12 Do not break after NBSP and related characters.
+# GL x
+#
+$GLcm $CAN_CM $CM*;
+$GLcm $CANT_CM;
+
+#
+# LB 12a Do not break before NBSP and related characters ...
+# [^SP BA HY] x GL
+#
+# First rule: any non-breaking base other than SP, BA or HY, with its
+# attached combining marks, followed by GL (and GL's combining marks).
+# Second rule: a run of combining marks with no base, followed by GL.
+#
+[[$LB8NonBreaks] - [$SP $BA $HY]] $CM* $GLcm;
+$CM+ $GLcm; # by rule 10, stand-alone CM behaves as AL
+
+
+
+#
+# LB 13 Don't break before ']' or '!' or ';' or '/', even after spaces.
+#
+$LB8NonBreaks $CL;
+$CAN_CM $CM* $CL;
+$CM+ $CL; # by rule 10, stand-alone CM behaves as AL
+
+$LB8NonBreaks $CP;
+$CAN_CM $CM* $CP;
+$CM+ $CP; # by rule 10, stand-alone CM behaves as AL
+
+$LB8NonBreaks $EX;
+$CAN_CM $CM* $EX;
+$CM+ $EX; # by rule 10, stand-alone CM behaves as AL
+
+$LB8NonBreaks $IS;
+$CAN_CM $CM* $IS;
+$CM+ $IS; # by rule 10, stand-alone CM behaves as AL
+
+$LB8NonBreaks $SY;
+$CAN_CM $CM* $SY;
+$CM+ $SY; # by rule 10, stand-alone CM behaves as AL
+
+
+#
+# LB 14 Do not break after OP, even after spaces
+#
+$OPcm $SP* $CAN_CM $CM*;
+$OPcm $SP* $CANT_CM;
+
+$OPcm $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL
+
+# LB 15
+$QUcm $SP* $OPcm;
+
+# LB 16
+($CLcm | $CPcm) $SP* $NScm;
+
+# LB 17
+$B2cm $SP* $B2cm;
+
+#
+# LB 18 Break after spaces.
+#
+$LB18NonBreaks = [$LB8NonBreaks - [$SP]];
+$LB18Breaks = [$LB8Breaks $SP];
+
+
+# LB 19
+# x QU
+$LB18NonBreaks $CM* $QUcm;
+$CM+ $QUcm;
+
+# QU x
+$QUcm .?;
+$QUcm $LB18NonBreaks $CM*; # Don't let a combining mark go onto $CR, $BK, etc.
+ # TODO: I don't think this rule is needed.
+
+
+# LB 20
+# <break> $CB
+# $CB <break>
+
+$LB20NonBreaks = [$LB18NonBreaks - $CB];
+
+# LB 21 x (BA | HY | NS)
+# BB x
+#
+$LB20NonBreaks $CM* ($BAcm | $HYcm | $NScm);
+
+$BBcm [^$CB]; # $BB x
+$BBcm $LB20NonBreaks $CM*;
+
+# LB 21a Don't break after Hebrew + Hyphen
+# HL (HY | BA) x
+#
+$HLcm ($HYcm | $BAcm) [^$CB]?;
+
+# LB 21b (forward) Don't break between SY and HL
+# (break between HL and SY already disallowed by LB 13 above)
+$SYcm $HLcm;
+
+# LB 22
+($ALcm | $HLcm) $INcm;
+$CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL
+$IDcm $INcm;
+$INcm $INcm;
+$NUcm $INcm;
+
+
+# $LB 23
+$IDcm $POcm;
+$ALcm $NUcm; # includes $LB19
+$HLcm $NUcm;
+$CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL
+$NUcm $ALcm;
+$NUcm $HLcm;
+
+#
+# LB 24
+#
+$PRcm $IDcm;
+$PRcm ($ALcm | $HLcm);
+$POcm ($ALcm | $HLcm);
+
+#
+# LB 25 Numbers.
+#
+($PRcm | $POcm)? ($OPcm | $HYcm)? $NUcm ($NUcm | $SYcm | $IScm)* ($CLcm | $CPcm)? ($PRcm | $POcm)?;
+
+# LB 26 Do not break a Korean syllable
+#
+$JLcm ($JLcm | $JVcm | $H2cm | $H3cm);
+($JVcm | $H2cm) ($JVcm | $JTcm);
+($JTcm | $H3cm) $JTcm;
+
+# LB 27 Treat korean Syllable Block the same as ID (don't break it)
+($JLcm | $JVcm | $JTcm | $H2cm | $H3cm) $INcm;
+($JLcm | $JVcm | $JTcm | $H2cm | $H3cm) $POcm;
+$PRcm ($JLcm | $JVcm | $JTcm | $H2cm | $H3cm);
+
+
+# LB 28 Do not break between alphabetics
+#
+($ALcm | $HLcm) ($ALcm | $HLcm);
+$CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treated as AL
+
+# LB 29
+$IScm ($ALcm | $HLcm);
+
+# LB 30
+($ALcm | $HLcm | $NUcm) $OPcm;
+$CM+ $OPcm; # The $CM+ is from rule 10, an unattached CM is treated as AL.
+$CPcm ($ALcm | $HLcm | $NUcm);
+
+# LB 30a Do not break between regional indicators.
+$RIcm $RIcm;
+
+# Special forward rule for people & body part emoji:
+# don't break between relevant emoji and $EmojiMods
+$EmojiForMods $EmojiVar? $EmojiMods;
+
+#
+# Reverse Rules.
+#
+## -------------------------------------------------
+
+!!reverse;
+
+$CM+ $ALPlus;
+$CM+ $BA;
+$CM+ $BB;
+$CM+ $B2;
+$CM+ $CL;
+$CM+ $CP;
+$CM+ $EX;
+$CM+ $GL;
+$CM+ $HL;
+$CM+ $HY;
+$CM+ $H2;
+$CM+ $H3;
+$CM+ $ID;
+$CM+ $IN;
+$CM+ $IS;
+$CM+ $JL;
+$CM+ $JV;
+$CM+ $JT;
+$CM+ $NS;
+$CM+ $NU;
+$CM+ $OP;
+$CM+ $PO;
+$CM+ $PR;
+$CM+ $QU;
+$CM+ $RI;
+$CM+ $SY;
+$CM+ $WJ;
+$CM+;
+
+
+#
+# Sequences of the form (shown forwards)
+# [CANT_CM] <break> [CM] [whatever]
+# The CM needs to behave as an AL
+#
+# This is a reverse rule, so it reads back to front: the part before '/'
+# consumes one $AL_FOLLOW character and the $CM+ run, and the look-ahead
+# after '/' inspects the text on the far side of that run (towards the
+# start). With !!lookAheadHardBreak (described near the top of this file)
+# a match stops the state machine and puts the boundary at the '/'.
+# The look-ahead has three alternatives:
+#   - a hard break class, $ZW or {eof};
+#   - spaces, more combining marks, then another space;
+#   - spaces and optional combining marks followed by anything other
+#     than $OP, $CM or $SP (or {eof}).
+#
+$AL_FOLLOW $CM+ / (
+ [$BK $CR $LF $NL $ZW {eof}] |
+ $SP+ $CM+ $SP |
+ $SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}])); # if LB 14 will match, need to suppress this break.
+ # LB14 says OP SP* x .
+ # becomes OP SP* x AL
+ # becomes OP SP* x CM+ AL_FOLLOW
+ #
+ # Further note: the $AL in [$AL {eof}] is only to work around
+ # a rule compiler bug which complains about
+ # empty sets otherwise.
+
+#
+# Sequences of the form (shown forwards)
+# [CANT_CM] <break> [CM] <break> [PR]
+# The CM needs to behave as an AL
+# This rule is concerned about getting the second of the two <breaks> in place.
+#
+
+[$PR ] / $CM+ [$BK $CR $LF $NL $ZW $SP {eof}];
+
+
+
+# LB 4, 5, 6
+
+$LB4Breaks [$LB4NonBreaks-$CM];
+$LB4Breaks $CM+ $CAN_CM;
+$LF $CR;
+
+
+# LB 7 x SP
+# x ZW
+[$SP $ZW] [$LB4NonBreaks-$CM];
+[$SP $ZW] $CM+ $CAN_CM;
+
+# LB 8 ZW SP* <break>
+# TODO: to implement this, we need more than one look-ahead hard break in play at a time.
+# Requires an engine enhancement.
+# / $SP* $ZW
+
+# Special reverse rule for people & body part emoji:
+# don't break $ZWJ from subsequent $EmojiForSeqs
+$EmojiForSeqs $ZWJ $EmojiMods? $EmojiVar? $EmojiForSeqs;
+
+# LB 9,10 Combining marks.
+# X $CM needs to behave like X, where X is not $SP or controls.
+# $CM not covered by the above needs to behave like $AL
+# Stick together any combining sequences that don't match other rules.
+$CM+ $CAN_CM;
+
+
+# LB 11
+$CM* $WJ $CM* $CAN_CM;
+$CM* $WJ [$LB8NonBreaks-$CM];
+
+ $CANT_CM $CM* $WJ;
+$CM* $CAN_CM $CM* $WJ;
+
+# LB 12a
+# [^SP BA HY] x GL
+#
+$CM* $GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HY]];
+
+# LB 12
+# GL x
+#
+$CANT_CM $CM* $GL;
+$CM* $CAN_CM $CM* $GL;
+
+
+# LB 13
+$CL $CM+ $CAN_CM;
+$CP $CM+ $CAN_CM;
+$EX $CM+ $CAN_CM;
+$IS $CM+ $CAN_CM;
+$SY $CM+ $CAN_CM;
+
+$CL [$LB8NonBreaks-$CM];
+$CP [$LB8NonBreaks-$CM];
+$EX [$LB8NonBreaks-$CM];
+$IS [$LB8NonBreaks-$CM];
+$SY [$LB8NonBreaks-$CM];
+
+# Rule 13 & 14 taken together for an edge case.
+# Match this, shown forward
+# OP SP+ ($CM+ behaving as $AL) (CL | CP | EX | IS | SY)
+# This really wants to chain at the $CM+ (which is acting as an $AL)
+# except for $CM chaining being disabled.
+[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP;
+
+# LB 14 OP SP* x
+#
+$CM* $CAN_CM $SP* $CM* $OP;
+ $CANT_CM $SP* $CM* $OP;
+$AL_FOLLOW? $CM+ $SP $SP* $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP* $CM* $OP
+
+ $AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;
+$CM* $AL_FOLLOW_CM $CM+ $SP+ $CM* $OP;
+$SY $CM $SP+ $OP; # TODO: Experiment. Remove.
+
+
+
+# LB 15
+$CM* $OP $SP* $CM* $QU;
+
+# LB 16
+$CM* $NS $SP* $CM* ($CL | $CP);
+
+# LB 17
+$CM* $B2 $SP* $CM* $B2;
+
+# LB 18 break after spaces
+# Nothing explicit needed here.
+
+
+#
+# LB 19
+#
+$CM* $QU $CM* $CAN_CM; # . x QU
+$CM* $QU $LB18NonBreaks;
+
+
+$CM* $CAN_CM $CM* $QU; # QU x .
+ $CANT_CM $CM* $QU;
+
+#
+# LB 20 Break before and after CB.
+# nothing needed here.
+#
+
+# LB 21
+$CM* ($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS)
+
+$CM* [$LB20NonBreaks-$CM] $CM* $BB; # BB x .
+[^$CB] $CM* $BB; #
+
+# LB21a
+[^$CB] $CM* ($HY | $BA) $CM* $HL;
+
+# LB21b (reverse)
+$CM* $HL $CM* $SY;
+
+# LB 22
+$CM* $IN $CM* ($ALPlus | $HL);
+$CM* $IN $CM* $ID;
+$CM* $IN $CM* $IN;
+$CM* $IN $CM* $NU;
+
+# LB 23
+$CM* $PO $CM* $ID;
+$CM* $NU $CM* ($ALPlus | $HL);
+$CM* ($ALPlus | $HL) $CM* $NU;
+
+# LB 24
+$CM* $ID $CM* $PR;
+$CM* ($ALPlus | $HL) $CM* $PR;
+$CM* ($ALPlus | $HL) $CM* $PO;
+
+
+# LB 25
+($CM* ($PR | $PO))? ($CM* ($CL | $CP))? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP | $HY))? ($CM* ($PR | $PO))?;
+
+# LB 26
+$CM* ($H3 | $H2 | $JV | $JL) $CM* $JL;
+$CM* ($JT | $JV) $CM* ($H2 | $JV);
+$CM* $JT $CM* ($H3 | $JT);
+
+# LB 27
+$CM* $IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
+$CM* $PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
+$CM* ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
+
+# LB 28
+$CM* ($ALPlus | $HL) $CM* ($ALPlus | $HL);
+
+
+# LB 29
+$CM* ($ALPlus | $HL) $CM* $IS;
+
+# LB 30
+$CM* $OP $CM* ($ALPlus | $HL | $NU);
+$CM* ($ALPlus | $HL | $NU) $CM* $CP;
+
+# LB 30a
+$CM* $RI $CM* $RI;
+
+# Special reverse rule for people & body part emoji:
+# don't break between relevant emoji and $EmojiMods
+$EmojiMods $EmojiVar? $EmojiForMods;
+
+## -------------------------------------------------
+
+!!safe_reverse;
+
+# LB 9
+$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
+$CM+ $SP / .;
+
+# LB 14
+$SP+ $CM* $OP;
+
+# LB 15
+$SP+ $CM* $QU;
+
+# LB 16
+$SP+ $CM* ($CL | $CP);
+
+# LB 17
+$SP+ $CM* $B2;
+
+# LB 21
+$CM* ($HY | $BA) $CM* $HL;
+
+# LB 25
+($CM* ($IS | $SY))+ $CM* $NU;
+($CL | $CP) $CM* ($NU | $IS | $SY);
+
+# For dictionary-based break
+$dictionary $dictionary;
+
+## -------------------------------------------------
+
+!!safe_forward;
+
+# Skip forward over all character classes that are involved in
+# rules containing patterns with possibly more than one char
+# of context.
+#
+# It might be slightly more efficient to have specific rules
+# instead of one generic one, but only if we could
+# turn off rule chaining. We don't want to move more
+# than necessary.
+#
+[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $SP $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $dictionary];
+$dictionary $dictionary;
+
--- /dev/null
+# Copyright (c) 2002-2015 International Business Machines Corporation and
+# others. All Rights Reserved.
+#
+# file: line_normal_cj.txt
+#
+# Line Breaking Rules
+# Implement default line breaking as defined by
+# Unicode Standard Annex #14 Revision 29 for Unicode 6.2
+# http://www.unicode.org/reports/tr14/
+# tailored as noted in the 2nd paragraph below.
+#
+# TODO: Rule LB 8 remains as it was in Unicode 5.2
+# This is only because of a limitation of ICU break engine implementation,
+# not because the older behavior is desirable.
+#
+# This tailors the line break behavior to correspond to CSS
+# line-break=normal (BCP47 -u-lb-normal) as defined for Chinese & Japanese.
+# It sets characters of class CJ to behave like ID.
+# In addition, it allows breaks:
+# * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS)
+
+#
+# Character Classes defined by TR 14.
+#
+
+!!chain;
+!!LBCMNoChain;
+
+
+!!lookAheadHardBreak;
+#
+# !!lookAheadHardBreak Described here because it is (as yet) undocumented elsewhere
+# and only used for the line break rules.
+#
+# It is used in the implementation of rule LB 10
+# which says to treat any combining mark that is not attached to a base
+# character as if it were of class AL (alphabetic).
+#
+# The problem occurs in the reverse rules.
+#
+# Consider a sequence like, with correct breaks as shown
+# LF ID CM AL AL
+# ^ ^ ^
+# Then consider the sequence without the initial ID (ideographic)
+# LF CM AL AL
+# ^ ^
+# Our CM, which in the first example was attached to the ideograph,
+# is now unattached, becomes an alpha, and joins in with the other
+# alphas.
+#
+# When iterating forwards, these sequences do not present any problems
+# When iterating backwards, we need to look ahead when encountering
+# a CM to see whether it attaches to something further on or not.
+# (Look-ahead in a reverse rule is looking towards the start)
+#
+# If the CM is unattached, we need to force a break.
+#
+# !!lookAheadHardBreak forces the run time state machine to
+# stop immediately when a look ahead rule ( '/' operator) matches,
+# and set the match position to that of the look-ahead operator,
+# no matter what other rules may be in play at the time.
+#
+# See the reverse rule "$AL_FOLLOW $CM+ / ( ... )" for an example.
+#
+
+$AI = [:LineBreak = Ambiguous:];
+$AL = [:LineBreak = Alphabetic:];
+$BAX = [\u2010 \u2013];
+$BA = [[:LineBreak = Break_After:] - $BAX];
+$BB = [:LineBreak = Break_Before:];
+$BK = [:LineBreak = Mandatory_Break:];
+$B2 = [:LineBreak = Break_Both:];
+$CB = [:LineBreak = Contingent_Break:];
+$CJ = [:LineBreak = Conditional_Japanese_Starter:];
+$CL = [:LineBreak = Close_Punctuation:];
+$CM = [:LineBreak = Combining_Mark:];
+$CP = [:LineBreak = Close_Parenthesis:];
+$CR = [:LineBreak = Carriage_Return:];
+$EX = [:LineBreak = Exclamation:];
+$GL = [:LineBreak = Glue:];
+$HL = [:LineBreak = Hebrew_Letter:];
+$HY = [:LineBreak = Hyphen:];
+$H2 = [:LineBreak = H2:];
+$H3 = [:LineBreak = H3:];
+$ID = [[:LineBreak = Ideographic:] $CJ];
+$IN = [:LineBreak = Inseperable:];
+$IS = [:LineBreak = Infix_Numeric:];
+$JL = [:LineBreak = JL:];
+$JV = [:LineBreak = JV:];
+$JT = [:LineBreak = JT:];
+$LF = [:LineBreak = Line_Feed:];
+$NL = [:LineBreak = Next_Line:];
+$NSX = [\u301C \u30A0];
+$NS = [[:LineBreak = Nonstarter:] - $NSX];
+$NU = [:LineBreak = Numeric:];
+$OP = [:LineBreak = Open_Punctuation:];
+$PO = [:LineBreak = Postfix_Numeric:];
+$PR = [:LineBreak = Prefix_Numeric:];
+$QU = [:LineBreak = Quotation:];
+$RI = [:LineBreak = Regional_Indicator:];
+$SA = [:LineBreak = Complex_Context:];
+$SG = [:LineBreak = Surrogate:];
+$SP = [:LineBreak = Space:];
+$SY = [:LineBreak = Break_Symbols:];
+$WJ = [:LineBreak = Word_Joiner:];
+$XX = [:LineBreak = Unknown:];
+$ZW = [:LineBreak = ZWSpace:];
+
+# Special character classes for people & body part emoji:
+# Subsets of $CM:
+$ZWJ = \u200D;
+$EmojiVar = \uFE0F;
+# The following are subsets of $ID
+$EmojiForSeqs = [\u2764 \U0001F466-\U0001F469 \U0001F48B];
+$EmojiForMods = [\u261D \u270A-\u270C \U0001F385 \U0001F3C3-\U0001F3C4 \U0001F3C7 \U0001F3CA \U0001F442-\U0001F443 \U0001F446-\U0001F450 \U0001F466-\U0001F469 \U0001F46E-\U0001F478 \U0001F47C \U0001F481-\U0001F483 \U0001F485-\U0001F487 \U0001F4AA \U0001F596 \U0001F645-\U0001F647 \U0001F64B-\U0001F64F \U0001F6A3 \U0001F6B4-\U0001F6B6 \U0001F6C0];
+$EmojiMods = [\U0001F3FB-\U0001F3FF];
+
+# Dictionary character set, for triggering language-based break engines. Currently
+# limited to LineBreak=Complex_Context. Note that this set only works in Unicode
+# 5.0 or later as the definition of Complex_Context was corrected to include all
+# characters requiring dictionary break.
+
+$dictionary = [:LineBreak = Complex_Context:];
+
+#
+# Rule LB1. By default, treat AI (characters with ambiguous east Asian width),
+# SA (South East Asian: Thai, Lao, Khmer)
+# SG (Unpaired Surrogates)
+# XX (Unknown, unassigned)
+# as $AL (Alphabetic)
+#
+$ALPlus = [$AL $AI $SA $SG $XX];
+
+#
+# Combining Marks. X $CM* behaves as if it were X. Rule LB 9.
+#
+$ALcm = $ALPlus $CM*;
+$BAcm = $BA $CM*;
+$BAXcm = $BAX $CM*;
+$BBcm = $BB $CM*;
+$B2cm = $B2 $CM*;
+$CLcm = $CL $CM*;
+$CPcm = $CP $CM*;
+$EXcm = $EX $CM*;
+$GLcm = $GL $CM*;
+$HLcm = $HL $CM*;
+$HYcm = $HY $CM*;
+$H2cm = $H2 $CM*;
+$H3cm = $H3 $CM*;
+$IDcm = $ID $CM*;
+$INcm = $IN $CM*;
+$IScm = $IS $CM*;
+$JLcm = $JL $CM*;
+$JVcm = $JV $CM*;
+$JTcm = $JT $CM*;
+$NScm = $NS $CM*;
+$NSXcm = $NSX $CM*;
+$NUcm = $NU $CM*;
+$OPcm = $OP $CM*;
+$POcm = $PO $CM*;
+$PRcm = $PR $CM*;
+$QUcm = $QU $CM*;
+$RIcm = $RI $CM*;
+$SYcm = $SY $CM*;
+$WJcm = $WJ $CM*;
+
+## -------------------------------------------------
+
+!!forward;
+
+#
+# Each class of character can stand by itself as an unbroken token, with trailing combining stuff
+#
+$ALPlus $CM+;
+$BA $CM+;
+$BAX $CM+;
+$BB $CM+;
+$B2 $CM+;
+$CL $CM+;
+$CP $CM+;
+$EX $CM+;
+$GL $CM+;
+$HL $CM+;
+$HY $CM+;
+$H2 $CM+;
+$H3 $CM+;
+$ID $CM+;
+$IN $CM+;
+$IS $CM+;
+$JL $CM+;
+$JV $CM+;
+$JT $CM+;
+$NS $CM+;
+$NSX $CM+;
+$NU $CM+;
+$OP $CM+;
+$PO $CM+;
+$PR $CM+;
+$QU $CM+;
+$RI $CM+;
+$SY $CM+;
+$WJ $CM+;
+
+#
+# CAN_CM is the set of characters that may combine with CM combining chars.
+# Note that Linebreak UAX 14's concept of a combining char and the rules
+# for what they can combine with are _very_ different from the rest of Unicode.
+#
+# Note that $CM itself is left out of this set. If CM is needed as a base
+# it must be listed separately in the rule.
+#
+$CAN_CM = [^$SP $BK $CR $LF $NL $ZW $CM]; # Bases that can take CMs
+$CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM]; # Bases that can't take CMs
+
+#
+# AL_FOLLOW set of chars that can unconditionally follow an AL
+# Needed in rules where stand-alone $CM s are treated as AL.
+# Chaining is disabled with CM because it causes other failures,
+# so for this one case we need to manually list out longer sequences.
+#
+$AL_FOLLOW_NOCM = [$BK $CR $LF $NL $ZW $SP];
+$AL_FOLLOW_CM = [$CL $CP $EX $HL $IS $SY $WJ $GL $OP $QU $BA $BAX $HY $NS $NSX $IN $NU $ALPlus];
+$AL_FOLLOW = [$AL_FOLLOW_NOCM $AL_FOLLOW_CM];
+
+
+#
+# Rule LB 4, 5 Mandatory (Hard) breaks.
+#
+$LB4Breaks = [$BK $CR $LF $NL];
+$LB4NonBreaks = [^$BK $CR $LF $NL];
+$CR $LF {100};
+
+#
+# LB 6 Do not break before hard line breaks.
+#
+$LB4NonBreaks? $LB4Breaks {100}; # LB 5 do not break before hard breaks.
+$CAN_CM $CM* $LB4Breaks {100};
+$CM+ $LB4Breaks {100};
+
+# LB 7 x SP
+# x ZW
+$LB4NonBreaks [$SP $ZW];
+$CAN_CM $CM* [$SP $ZW];
+$CM+ [$SP $ZW];
+
+#
+# LB 8 Break after zero width space
+# TODO: ZW SP* <break>
+# An engine change is required to write the reverse rule for this.
+# For now, leave the Unicode 5.2 rule, ZW <break>
+#
+$LB8Breaks = [$LB4Breaks $ZW];
+$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
+
+
+# Special forward rule for people & body part emoji:
+# don't break $ZWJ from subsequent $EmojiForSeqs
+$EmojiForSeqs $EmojiVar? $EmojiMods? $ZWJ $EmojiForSeqs;
+
+# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
+# $CM not covered by the above needs to behave like $AL
+# See definition of $CAN_CM.
+
+$CAN_CM $CM+; # Stick together any combining sequences that don't match other rules.
+$CM+;
+
+#
+# LB 11 Do not break before or after WORD JOINER & related characters.
+#
+$CAN_CM $CM* $WJcm;
+$LB8NonBreaks $WJcm;
+$CM+ $WJcm;
+
+$WJcm $CANT_CM;
+$WJcm $CAN_CM $CM*;
+
+#
+# LB 12 Do not break after NBSP and related characters.
+# GL x
+#
+$GLcm $CAN_CM $CM*;
+$GLcm $CANT_CM;
+
+#
+# LB 12a Do not break before NBSP and related characters ...
+# [^SP BA HY] x GL
+#
+[[$LB8NonBreaks] - [$SP $BA $BAX $HY]] $CM* $GLcm;
+$CM+ $GLcm; # by rule 10, stand-alone CM behaves as AL, and AL x GL (was literal "GLcm": missing '$')
+
+
+
+#
+# LB 13 Don't break before ']' or '!' or ';' or '/', even after spaces.
+#
+$LB8NonBreaks $CL;
+$CAN_CM $CM* $CL;
+$CM+ $CL; # by rule 10, stand-alone CM behaves as AL
+
+$LB8NonBreaks $CP;
+$CAN_CM $CM* $CP;
+$CM+ $CP; # by rule 10, stand-alone CM behaves as AL
+
+$LB8NonBreaks $EX;
+$CAN_CM $CM* $EX;
+$CM+ $EX; # by rule 10, stand-alone CM behaves as AL
+
+$LB8NonBreaks $IS;
+$CAN_CM $CM* $IS;
+$CM+ $IS; # by rule 10, stand-alone CM behaves as AL
+
+$LB8NonBreaks $SY;
+$CAN_CM $CM* $SY;
+$CM+ $SY; # by rule 10, stand-alone CM behaves as AL
+
+
+#
+# LB 14 Do not break after OP, even after spaces
+#
+$OPcm $SP* $CAN_CM $CM*;
+$OPcm $SP* $CANT_CM;
+
+$OPcm $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL
+
+# LB 15
+$QUcm $SP* $OPcm;
+
+# LB 16
+# Do not break between closing punctuation and $NS, even with intervening spaces
+# But DO allow a break between closing punctuation and $NSX, don't include it here
+($CLcm | $CPcm) $SP* $NScm;
+
+# LB 17
+$B2cm $SP* $B2cm;
+
+#
+# LB 18 Break after spaces.
+#
+$LB18NonBreaks = [$LB8NonBreaks - [$SP]];
+$LB18Breaks = [$LB8Breaks $SP];
+
+
+# LB 19
+# x QU
+$LB18NonBreaks $CM* $QUcm;
+$CM+ $QUcm;
+
+# QU x
+$QUcm .?;
+$QUcm $LB18NonBreaks $CM*; # Don't let a combining mark go onto $CR, $BK, etc.
+ # TODO: I don't think this rule is needed.
+
+
+# LB 20
+# <break> $CB
+# $CB <break>
+
+$LB20NonBreaks = [$LB18NonBreaks - $CB];
+
+# LB 21 x (BA | HY | NS)
+# BB x
+#
+# DO allow breaks here before $BAXcm and $NSXcm, so don't include them
+$LB20NonBreaks $CM* ($BAcm | $HYcm | $NScm);
+
+$BBcm [^$CB]; # $BB x
+$BBcm $LB20NonBreaks $CM*;
+
+# LB 21a Don't break after Hebrew + Hyphen
+# HL (HY | BA) x
+#
+$HLcm ($HYcm | $BAcm | $BAXcm) [^$CB]?;
+
+# LB 21b (forward) Don't break between SY and HL
+# (break between HL and SY already disallowed by LB 13 above)
+$SYcm $HLcm;
+
+# LB 22
+($ALcm | $HLcm) $INcm;
+$CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL
+$IDcm $INcm;
+$INcm $INcm;
+$NUcm $INcm;
+
+
+# $LB 23
+$IDcm $POcm;
+$ALcm $NUcm; # includes $LB19
+$HLcm $NUcm;
+$CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL
+$NUcm $ALcm;
+$NUcm $HLcm;
+
+#
+# LB 24
+#
+$PRcm $IDcm;
+$PRcm ($ALcm | $HLcm);
+$POcm ($ALcm | $HLcm);
+
+#
+# LB 25 Numbers.
+#
+($PRcm | $POcm)? ($OPcm | $HYcm)? $NUcm ($NUcm | $SYcm | $IScm)* ($CLcm | $CPcm)? ($PRcm | $POcm)?;
+
+# LB 26 Do not break a Korean syllable
+#
+$JLcm ($JLcm | $JVcm | $H2cm | $H3cm);
+($JVcm | $H2cm) ($JVcm | $JTcm);
+($JTcm | $H3cm) $JTcm;
+
+# LB 27 Treat Korean Syllable Block the same as ID (don't break it)
+($JLcm | $JVcm | $JTcm | $H2cm | $H3cm) $INcm;
+($JLcm | $JVcm | $JTcm | $H2cm | $H3cm) $POcm;
+$PRcm ($JLcm | $JVcm | $JTcm | $H2cm | $H3cm);
+
+
+# LB 28 Do not break between alphabetics
+#
+($ALcm | $HLcm) ($ALcm | $HLcm);
+$CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treated as AL
+
+# LB 29
+$IScm ($ALcm | $HLcm);
+
+# LB 30
+($ALcm | $HLcm | $NUcm) $OPcm;
+$CM+ $OPcm; # The $CM+ is from rule 10, an unattached CM is treated as AL.
+$CPcm ($ALcm | $HLcm | $NUcm);
+
+# LB 30a Do not break between regional indicators.
+$RIcm $RIcm;
+
+# Special forward rule for people & body part emoji:
+# don't break between relevant emoji and $EmojiMods
+$EmojiForMods $EmojiVar? $EmojiMods;
+
+#
+# Reverse Rules.
+#
+## -------------------------------------------------
+
+!!reverse;
+
+$CM+ $ALPlus;
+$CM+ $BA;
+$CM+ $BAX;
+$CM+ $BB;
+$CM+ $B2;
+$CM+ $CL;
+$CM+ $CP;
+$CM+ $EX;
+$CM+ $GL;
+$CM+ $HL;
+$CM+ $HY;
+$CM+ $H2;
+$CM+ $H3;
+$CM+ $ID;
+$CM+ $IN;
+$CM+ $IS;
+$CM+ $JL;
+$CM+ $JV;
+$CM+ $JT;
+$CM+ $NS;
+$CM+ $NSX;
+$CM+ $NU;
+$CM+ $OP;
+$CM+ $PO;
+$CM+ $PR;
+$CM+ $QU;
+$CM+ $RI;
+$CM+ $SY;
+$CM+ $WJ;
+$CM+;
+
+
+#
+# Sequences of the form (shown forwards)
+# [CANT_CM] <break> [CM] [whatever]
+# The CM needs to behave as an AL
+#
+$AL_FOLLOW $CM+ / (
+ [$BK $CR $LF $NL $ZW {eof}] |
+ $SP+ $CM+ $SP |
+ $SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}])); # if LB 14 will match, need to suppress this break.
+ # LB14 says OP SP* x .
+ # becomes OP SP* x AL
+ # becomes OP SP* x CM+ AL_FOLLOW
+ #
+ # Further note: the $AL in [$AL {eof}] is only to work around
+ # a rule compiler bug which complains about
+ # empty sets otherwise.
+
+#
+# Sequences of the form (shown forwards)
+# [CANT_CM] <break> [CM] <break> [PR]
+# The CM needs to behave as an AL
+# This rule is concerned about getting the second of the two <breaks> in place.
+#
+
+[$PR ] / $CM+ [$BK $CR $LF $NL $ZW $SP {eof}];
+
+
+
+# LB 4, 5, 6
+
+$LB4Breaks [$LB4NonBreaks-$CM];
+$LB4Breaks $CM+ $CAN_CM;
+$LF $CR;
+
+
+# LB 7 x SP
+# x ZW
+[$SP $ZW] [$LB4NonBreaks-$CM];
+[$SP $ZW] $CM+ $CAN_CM;
+
+# LB 8 ZW SP* <break>
+# TODO: to implement this, we need more than one look-ahead hard break in play at a time.
+# Requires an engine enhancement.
+# / $SP* $ZW
+
+# Special reverse rule for people & body part emoji:
+# don't break $ZWJ from subsequent $EmojiForSeqs
+$EmojiForSeqs $ZWJ $EmojiMods? $EmojiVar? $EmojiForSeqs;
+
+# LB 9,10 Combining marks.
+# X $CM needs to behave like X, where X is not $SP or controls.
+# $CM not covered by the above needs to behave like $AL
+# Stick together any combining sequences that don't match other rules.
+$CM+ $CAN_CM;
+
+
+# LB 11
+$CM* $WJ $CM* $CAN_CM;
+$CM* $WJ [$LB8NonBreaks-$CM];
+
+ $CANT_CM $CM* $WJ;
+$CM* $CAN_CM $CM* $WJ;
+
+# LB 12a
+# [^SP BA HY] x GL
+#
+$CM* $GL $CM* [$LB8NonBreaks-[$CM $SP $BA $BAX $HY]];
+
+# LB 12
+# GL x
+#
+$CANT_CM $CM* $GL;
+$CM* $CAN_CM $CM* $GL;
+
+
+# LB 13
+$CL $CM+ $CAN_CM;
+$CP $CM+ $CAN_CM;
+$EX $CM+ $CAN_CM;
+$IS $CM+ $CAN_CM;
+$SY $CM+ $CAN_CM;
+
+$CL [$LB8NonBreaks-$CM];
+$CP [$LB8NonBreaks-$CM];
+$EX [$LB8NonBreaks-$CM];
+$IS [$LB8NonBreaks-$CM];
+$SY [$LB8NonBreaks-$CM];
+
+# Rule 13 & 14 taken together for an edge case.
+# Match this, shown forward
+# OP SP+ ($CM+ behaving as $AL) (CL | CP | EX | IS | SY)
+# This really wants to chain at the $CM+ (which is acting as an $AL)
+# except for $CM chaining being disabled.
+[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP;
+
+# LB 14 OP SP* x
+#
+$CM* $CAN_CM $SP* $CM* $OP;
+ $CANT_CM $SP* $CM* $OP;
+$AL_FOLLOW? $CM+ $SP $SP* $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP* $CM* $OP
+
+ $AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;
+$CM* $AL_FOLLOW_CM $CM+ $SP+ $CM* $OP;
+$SY $CM $SP+ $OP; # TODO: Experiment. Remove.
+
+
+
+# LB 15
+$CM* $OP $SP* $CM* $QU;
+
+# LB 16
+# Don't include $NSX here
+$CM* $NS $SP* $CM* ($CL | $CP);
+
+# LB 17
+$CM* $B2 $SP* $CM* $B2;
+
+# LB 18 break after spaces
+# Nothing explicit needed here.
+
+
+#
+# LB 19
+#
+$CM* $QU $CM* $CAN_CM; # . x QU
+$CM* $QU $LB18NonBreaks;
+
+
+$CM* $CAN_CM $CM* $QU; # QU x .
+ $CANT_CM $CM* $QU;
+
+#
+# LB 20 Break before and after CB.
+# nothing needed here.
+#
+
+# LB 21
+# Don't include $BAX or $NSX here
+$CM* ($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS)
+
+$CM* [$LB20NonBreaks-$CM] $CM* $BB; # BB x .
+[^$CB] $CM* $BB; #
+
+# LB21a
+[^$CB] $CM* ($HY | $BA | $BAX) $CM* $HL;
+
+# LB21b (reverse)
+$CM* $HL $CM* $SY;
+
+# LB 22
+$CM* $IN $CM* ($ALPlus | $HL);
+$CM* $IN $CM* $ID;
+$CM* $IN $CM* $IN;
+$CM* $IN $CM* $NU;
+
+# LB 23
+$CM* $PO $CM* $ID;
+$CM* $NU $CM* ($ALPlus | $HL);
+$CM* ($ALPlus | $HL) $CM* $NU;
+
+# LB 24
+$CM* $ID $CM* $PR;
+$CM* ($ALPlus | $HL) $CM* $PR;
+$CM* ($ALPlus | $HL) $CM* $PO;
+
+
+# LB 25
+($CM* ($PR | $PO))? ($CM* ($CL | $CP))? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP | $HY))? ($CM* ($PR | $PO))?;
+
+# LB 26
+$CM* ($H3 | $H2 | $JV | $JL) $CM* $JL;
+$CM* ($JT | $JV) $CM* ($H2 | $JV);
+$CM* $JT $CM* ($H3 | $JT);
+
+# LB 27
+$CM* $IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
+$CM* $PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
+$CM* ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
+
+# LB 28
+$CM* ($ALPlus | $HL) $CM* ($ALPlus | $HL);
+
+
+# LB 29
+$CM* ($ALPlus | $HL) $CM* $IS;
+
+# LB 30
+$CM* $OP $CM* ($ALPlus | $HL | $NU);
+$CM* ($ALPlus | $HL | $NU) $CM* $CP;
+
+# LB 30a
+$CM* $RI $CM* $RI;
+
+# Special reverse rule for people & body part emoji:
+# don't break between relevant emoji and $EmojiMods
+$EmojiMods $EmojiVar? $EmojiForMods;
+
+## -------------------------------------------------
+
+!!safe_reverse;
+
+# LB 9
+$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
+$CM+ $SP / .;
+
+# LB 14
+$SP+ $CM* $OP;
+
+# LB 15
+$SP+ $CM* $QU;
+
+# LB 16
+$SP+ $CM* ($CL | $CP);
+
+# LB 17
+$SP+ $CM* $B2;
+
+# LB 21a HL (HY | BA) x ; $BAX holds U+2010/U+2013, which this file removes from $BA, so list it too
+$CM* ($HY | $BA | $BAX) $CM* $HL;
+
+# LB 25
+($CM* ($IS | $SY))+ $CM* $NU;
+($CL | $CP) $CM* ($NU | $IS | $SY);
+
+# For dictionary-based break
+$dictionary $dictionary;
+
+## -------------------------------------------------
+
+!!safe_forward;
+
+# Skip forward over all character classes that are involved in
+# rules containing patterns with possibly more than one char
+# of context.
+#
+# It might be slightly more efficient to have specific rules
+# instead of one generic one, but only if we could
+# turn off rule chaining. We don't want to move more
+# than necessary.
+#
+[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $BAX $SP $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $BAX $dictionary]; # $BAX = U+2010/U+2013 split out of $BA in this file
+$dictionary $dictionary;
+
--- /dev/null
+# Copyright (c) 2002-2015 International Business Machines Corporation and
+# others. All Rights Reserved.
+#
+# file: line_normal_fi.txt
+#
+# Line Breaking Rules
+# Implement default line breaking as defined by
+# Unicode Standard Annex #14 Revision 29 for Unicode 6.2
+# http://www.unicode.org/reports/tr14/
+# tailored as noted in the 2nd paragraph below.
+#
+# TODO: Rule LB 8 remains as it was in Unicode 5.2
+# This is only because of a limitation of ICU break engine implementation,
+# not because the older behavior is desirable.
+#
+# This tailors the line break behavior both for Finnish and to correspond to CSS
+# line-break=normal (BCP47 -u-lb-normal) as defined for languages other than
+# Chinese & Japanese.
+# It sets characters of class CJ to behave like ID.
+
+#
+# Character Classes defined by TR 14.
+#
+
+!!chain;
+!!LBCMNoChain;
+
+
+!!lookAheadHardBreak;
+#
+# !!lookAheadHardBreak Described here because it is (as yet) undocumented elsewhere
+# and only used for the line break rules.
+#
+# It is used in the implementation of rule LB 10
+# which says to treat any combining mark that is not attached to a base
+# character as if it were of class AL (alphabetic).
+#
+# The problem occurs in the reverse rules.
+#
+# Consider a sequence like, with correct breaks as shown
+# LF ID CM AL AL
+# ^ ^ ^
+# Then consider the sequence without the initial ID (ideographic)
+# LF CM AL AL
+# ^ ^
+# Our CM, which in the first example was attached to the ideograph,
+# is now unattached, becomes an alpha, and joins in with the other
+# alphas.
+#
+# When iterating forwards, these sequences do not present any problems
+# When iterating backwards, we need to look ahead when encountering
+# a CM to see whether it attaches to something further on or not.
+# (Look-ahead in a reverse rule is looking towards the start)
+#
+# If the CM is unattached, we need to force a break.
+#
+# !!lookAheadHardBreak forces the run time state machine to
+# stop immediately when a look ahead rule ( '/' operator) matches,
+# and set the match position to that of the look-ahead operator,
+# no matter what other rules may be in play at the time.
+#
+# See the reverse rule "$AL_FOLLOW $CM+ / ( ... )" for an example.
+#
+
+$AI = [:LineBreak = Ambiguous:];
+$AL = [:LineBreak = Alphabetic:];
+$BA = [[:LineBreak = Break_After:] - [\u2010]];
+$HH = [\u2010];
+$BB = [:LineBreak = Break_Before:];
+$BK = [:LineBreak = Mandatory_Break:];
+$B2 = [:LineBreak = Break_Both:];
+$CB = [:LineBreak = Contingent_Break:];
+$CJ = [:LineBreak = Conditional_Japanese_Starter:];
+$CL = [:LineBreak = Close_Punctuation:];
+$CM = [:LineBreak = Combining_Mark:];
+$CP = [:LineBreak = Close_Parenthesis:];
+$CR = [:LineBreak = Carriage_Return:];
+$EX = [:LineBreak = Exclamation:];
+$GL = [:LineBreak = Glue:];
+$HL = [:LineBreak = Hebrew_Letter:];
+$HY = [:LineBreak = Hyphen:];
+$H2 = [:LineBreak = H2:];
+$H3 = [:LineBreak = H3:];
+$ID = [[:LineBreak = Ideographic:] $CJ];
+$IN = [:LineBreak = Inseperable:];
+$IS = [:LineBreak = Infix_Numeric:];
+$JL = [:LineBreak = JL:];
+$JV = [:LineBreak = JV:];
+$JT = [:LineBreak = JT:];
+$LF = [:LineBreak = Line_Feed:];
+$NL = [:LineBreak = Next_Line:];
+$NS = [:LineBreak = Nonstarter:];
+$NU = [:LineBreak = Numeric:];
+$OP = [:LineBreak = Open_Punctuation:];
+$PO = [:LineBreak = Postfix_Numeric:];
+$PR = [:LineBreak = Prefix_Numeric:];
+$QU = [:LineBreak = Quotation:];
+$RI = [:LineBreak = Regional_Indicator:];
+$SA = [:LineBreak = Complex_Context:];
+$SG = [:LineBreak = Surrogate:];
+$SP = [:LineBreak = Space:];
+$SY = [:LineBreak = Break_Symbols:];
+$WJ = [:LineBreak = Word_Joiner:];
+$XX = [:LineBreak = Unknown:];
+$ZW = [:LineBreak = ZWSpace:];
+
+# Special character classes for people & body part emoji:
+# Subsets of $CM:
+$ZWJ = \u200D;
+$EmojiVar = \uFE0F;
+# The following are subsets of $ID
+$EmojiForSeqs = [\u2764 \U0001F466-\U0001F469 \U0001F48B];
+$EmojiForMods = [\u261D \u270A-\u270C \U0001F385 \U0001F3C3-\U0001F3C4 \U0001F3C7 \U0001F3CA \U0001F442-\U0001F443 \U0001F446-\U0001F450 \U0001F466-\U0001F469 \U0001F46E-\U0001F478 \U0001F47C \U0001F481-\U0001F483 \U0001F485-\U0001F487 \U0001F4AA \U0001F596 \U0001F645-\U0001F647 \U0001F64B-\U0001F64F \U0001F6A3 \U0001F6B4-\U0001F6B6 \U0001F6C0];
+$EmojiMods = [\U0001F3FB-\U0001F3FF];
+
+# Dictionary character set, for triggering language-based break engines. Currently
+# limited to LineBreak=Complex_Context. Note that this set only works in Unicode
+# 5.0 or later as the definition of Complex_Context was corrected to include all
+# characters requiring dictionary break.
+
+$dictionary = [:LineBreak = Complex_Context:];
+
+#
+# Rule LB1. By default, treat AI (characters with ambiguous east Asian width),
+# SA (South East Asian: Thai, Lao, Khmer)
+# SG (Unpaired Surrogates)
+# XX (Unknown, unassigned)
+# as $AL (Alphabetic)
+#
+$ALPlus = [$AL $AI $SA $SG $XX];
+
+#
+# Combining Marks. X $CM* behaves as if it were X. Rule LB 9.
+#
+$ALcm = $ALPlus $CM*;
+$BAcm = $BA $CM*;
+$HHcm = $HH $CM*;
+$BBcm = $BB $CM*;
+$B2cm = $B2 $CM*;
+$CLcm = $CL $CM*;
+$CPcm = $CP $CM*;
+$EXcm = $EX $CM*;
+$GLcm = $GL $CM*;
+$HLcm = $HL $CM*;
+$HYcm = $HY $CM*;
+$H2cm = $H2 $CM*;
+$H3cm = $H3 $CM*;
+$IDcm = $ID $CM*;
+$INcm = $IN $CM*;
+$IScm = $IS $CM*;
+$JLcm = $JL $CM*;
+$JVcm = $JV $CM*;
+$JTcm = $JT $CM*;
+$NScm = $NS $CM*;
+$NUcm = $NU $CM*;
+$OPcm = $OP $CM*;
+$POcm = $PO $CM*;
+$PRcm = $PR $CM*;
+$QUcm = $QU $CM*;
+$RIcm = $RI $CM*;
+$SYcm = $SY $CM*;
+$WJcm = $WJ $CM*;
+
+## -------------------------------------------------
+
+!!forward;
+
+#
+# Each class of character can stand by itself as an unbroken token, with trailing combining stuff
+#
+$ALPlus $CM+;
+$BA $CM+;
+$HH $CM+;
+$BB $CM+;
+$B2 $CM+;
+$CL $CM+;
+$CP $CM+;
+$EX $CM+;
+$GL $CM+;
+$HL $CM+;
+$HY $CM+;
+$H2 $CM+;
+$H3 $CM+;
+$ID $CM+;
+$IN $CM+;
+$IS $CM+;
+$JL $CM+;
+$JV $CM+;
+$JT $CM+;
+$NS $CM+;
+$NU $CM+;
+$OP $CM+;
+$PO $CM+;
+$PR $CM+;
+$QU $CM+;
+$RI $CM+;
+$SY $CM+;
+$WJ $CM+;
+
+#
+# CAN_CM is the set of characters that may combine with CM combining chars.
+# Note that Linebreak UAX 14's concept of a combining char and the rules
+# for what they can combine with are _very_ different from the rest of Unicode.
+#
+# Note that $CM itself is left out of this set. If CM is needed as a base
+# it must be listed separately in the rule.
+#
+$CAN_CM = [^$SP $BK $CR $LF $NL $ZW $CM]; # Bases that can take CMs
+$CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM]; # Bases that can't take CMs
+
+#
+# AL_FOLLOW set of chars that can unconditionally follow an AL
+# Needed in rules where stand-alone $CM s are treated as AL.
+# Chaining is disabled with CM because it causes other failures,
+# so for this one case we need to manually list out longer sequences.
+#
+$AL_FOLLOW_NOCM = [$BK $CR $LF $NL $ZW $SP];
+$AL_FOLLOW_CM = [$CL $CP $EX $HL $IS $SY $WJ $GL $OP $QU $BA $HH $HY $NS $IN $NU $ALPlus];
+$AL_FOLLOW = [$AL_FOLLOW_NOCM $AL_FOLLOW_CM];
+
+
+#
+# Rule LB 4, 5 Mandatory (Hard) breaks.
+#
+$LB4Breaks = [$BK $CR $LF $NL];
+$LB4NonBreaks = [^$BK $CR $LF $NL];
+$CR $LF {100};
+
+#
+# LB 6 Do not break before hard line breaks.
+#
+$LB4NonBreaks? $LB4Breaks {100}; # LB 5 do not break before hard breaks.
+$CAN_CM $CM* $LB4Breaks {100};
+$CM+ $LB4Breaks {100};
+
+# LB 7 x SP
+# x ZW
+$LB4NonBreaks [$SP $ZW];
+$CAN_CM $CM* [$SP $ZW];
+$CM+ [$SP $ZW];
+
+#
+# LB 8 Break after zero width space
+# TODO: ZW SP* <break>
+# An engine change is required to write the reverse rule for this.
+# For now, leave the Unicode 5.2 rule, ZW <break>
+#
+$LB8Breaks = [$LB4Breaks $ZW];
+$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
+
+
+# Special forward rule for people & body part emoji:
+# don't break $ZWJ from subsequent $EmojiForSeqs
+$EmojiForSeqs $EmojiVar? $EmojiMods? $ZWJ $EmojiForSeqs;
+
+# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
+# $CM not covered by the above needs to behave like $AL
+# See definition of $CAN_CM.
+
+$CAN_CM $CM+; # Stick together any combining sequences that don't match other rules.
+$CM+;
+
+#
+# LB 11 Do not break before or after WORD JOINER & related characters.
+#
+$CAN_CM $CM* $WJcm;
+$LB8NonBreaks $WJcm;
+$CM+ $WJcm;
+
+$WJcm $CANT_CM;
+$WJcm $CAN_CM $CM*;
+
+#
+# LB 12 Do not break after NBSP and related characters.
+# GL x
+#
+$GLcm $CAN_CM $CM*;
+$GLcm $CANT_CM;
+
+#
+# LB 12a Do not break before NBSP and related characters ...
+# [^SP BA HY] x GL
+#
+[[$LB8NonBreaks] - [$SP $BA $HH $HY]] $CM* $GLcm;
+$CM+ $GLcm; # by rule 10, stand-alone CM behaves as AL, and AL x GL (was literal "GLcm": missing '$')
+
+
+
+#
+# LB 13 Don't break before ']' or '!' or ';' or '/', even after spaces.
+#
+$LB8NonBreaks $CL;
+$CAN_CM $CM* $CL;
+$CM+ $CL; # by rule 10, stand-alone CM behaves as AL
+
+$LB8NonBreaks $CP;
+$CAN_CM $CM* $CP;
+$CM+ $CP; # by rule 10, stand-alone CM behaves as AL
+
+$LB8NonBreaks $EX;
+$CAN_CM $CM* $EX;
+$CM+ $EX; # by rule 10, stand-alone CM behaves as AL
+
+$LB8NonBreaks $IS;
+$CAN_CM $CM* $IS;
+$CM+ $IS; # by rule 10, stand-alone CM behaves as AL
+
+$LB8NonBreaks $SY;
+$CAN_CM $CM* $SY;
+$CM+ $SY; # by rule 10, stand-alone CM behaves as AL
+
+
+#
+# LB 14 Do not break after OP, even after spaces
+#
+$OPcm $SP* $CAN_CM $CM*;
+$OPcm $SP* $CANT_CM;
+
+$OPcm $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL
+
+# LB 15
+$QUcm $SP* $OPcm;
+
+# LB 16
+($CLcm | $CPcm) $SP* $NScm;
+
+# LB 17
+$B2cm $SP* $B2cm;
+
+#
+# LB 18 Break after spaces.
+#
+$LB18NonBreaks = [$LB8NonBreaks - [$SP]];
+$LB18Breaks = [$LB8Breaks $SP];
+
+
+# LB 19
+# x QU
+$LB18NonBreaks $CM* $QUcm;
+$CM+ $QUcm;
+
+# QU x
+$QUcm .?;
+$QUcm $LB18NonBreaks $CM*; # Don't let a combining mark go onto $CR, $BK, etc.
+ # TODO: I don't think this rule is needed.
+
+
+# LB 20
+# <break> $CB
+# $CB <break>
+
+$LB20NonBreaks = [$LB18NonBreaks - $CB];
+
+# LB 20.09 added rule for Finnish tailoring
+# LB 21 x (BA | HY | NS)
+# BB x
+#
+$LB20NonBreaks $CM* ($BAcm | $HHcm | $HYcm | $NScm) / $AL;
+$LB20NonBreaks $CM* ($BAcm | $HHcm | $HYcm | $NScm);
+($HY | $HH) $AL;
+
+$BBcm [^$CB]; # $BB x
+$BBcm $LB20NonBreaks $CM*;
+
+# LB 21a Don't break after Hebrew + Hyphen
+# HL (HY | BA) x
+#
+$HLcm ($HYcm | $BAcm | $HHcm) [^$CB]?;
+
+# LB 21b (forward) Don't break between SY and HL
+# (break between HL and SY already disallowed by LB 13 above)
+$SYcm $HLcm;
+
+# LB 22
+($ALcm | $HLcm) $INcm;
+$CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL
+$IDcm $INcm;
+$INcm $INcm;
+$NUcm $INcm;
+
+
+# $LB 23
+$IDcm $POcm;
+$ALcm $NUcm; # includes $LB19
+$HLcm $NUcm;
+$CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL
+$NUcm $ALcm;
+$NUcm $HLcm;
+
+#
+# LB 24
+#
+$PRcm $IDcm;
+$PRcm ($ALcm | $HLcm);
+$POcm ($ALcm | $HLcm);
+
+#
+# LB 25 Numbers.
+#
+($PRcm | $POcm)? ($OPcm | $HYcm)? $NUcm ($NUcm | $SYcm | $IScm)* ($CLcm | $CPcm)? ($PRcm | $POcm)?;
+
+# LB 26 Do not break a Korean syllable
+#
+$JLcm ($JLcm | $JVcm | $H2cm | $H3cm);
+($JVcm | $H2cm) ($JVcm | $JTcm);
+($JTcm | $H3cm) $JTcm;
+
+# LB 27 Treat Korean Syllable Block the same as ID (don't break it)
+($JLcm | $JVcm | $JTcm | $H2cm | $H3cm) $INcm;
+($JLcm | $JVcm | $JTcm | $H2cm | $H3cm) $POcm;
+$PRcm ($JLcm | $JVcm | $JTcm | $H2cm | $H3cm);
+
+
+# LB 28 Do not break between alphabetics
+#
+($ALcm | $HLcm) ($ALcm | $HLcm);
+$CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treated as AL
+
+# LB 29
+$IScm ($ALcm | $HLcm);
+
+# LB 30
+($ALcm | $HLcm | $NUcm) $OPcm;
+$CM+ $OPcm; # The $CM+ is from rule 10, an unattached CM is treated as AL.
+$CPcm ($ALcm | $HLcm | $NUcm);
+
+# LB 30a Do not break between regional indicators.
+$RIcm $RIcm;
+
+# Special forward rule for people & body part emoji:
+# don't break between relevant emoji and $EmojiMods
+$EmojiForMods $EmojiVar? $EmojiMods;
+
+#
+# Reverse Rules.
+#
+## -------------------------------------------------
+
+!!reverse;
+
+$CM+ $ALPlus;
+$CM+ $BA;
+$CM+ $HH;
+$CM+ $BB;
+$CM+ $B2;
+$CM+ $CL;
+$CM+ $CP;
+$CM+ $EX;
+$CM+ $GL;
+$CM+ $HL;
+$CM+ $HY;
+$CM+ $H2;
+$CM+ $H3;
+$CM+ $ID;
+$CM+ $IN;
+$CM+ $IS;
+$CM+ $JL;
+$CM+ $JV;
+$CM+ $JT;
+$CM+ $NS;
+$CM+ $NU;
+$CM+ $OP;
+$CM+ $PO;
+$CM+ $PR;
+$CM+ $QU;
+$CM+ $RI;
+$CM+ $SY;
+$CM+ $WJ;
+$CM+;
+
+
+#
+# Sequences of the form (shown forwards)
+# [CANT_CM] <break> [CM] [whatever]
+# The CM needs to behave as an AL
+#
+$AL_FOLLOW $CM+ / (
+ [$BK $CR $LF $NL $ZW {eof}] |
+ $SP+ $CM+ $SP |
+ $SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}])); # if LB 14 will match, need to suppress this break.
+ # LB14 says OP SP* x .
+ # becomes OP SP* x AL
+ # becomes OP SP* x CM+ AL_FOLLOW
+ #
+ # Further note: the $AL in [$AL {eof}] is only to work around
+ # a rule compiler bug which complains about
+ # empty sets otherwise.
+
+#
+# Sequences of the form (shown forwards)
+# [CANT_CM] <break> [CM] <break> [PR]
+# The CM needs to behave as an AL
+# This rule is concerned about getting the second of the two <breaks> in place.
+#
+
+[$PR ] / $CM+ [$BK $CR $LF $NL $ZW $SP {eof}];
+
+
+
+# LB 4, 5, 6
+
+$LB4Breaks [$LB4NonBreaks-$CM];
+$LB4Breaks $CM+ $CAN_CM;
+$LF $CR;
+
+
+# LB 7 x SP
+# x ZW
+[$SP $ZW] [$LB4NonBreaks-$CM];
+[$SP $ZW] $CM+ $CAN_CM;
+
+# LB 8 ZW SP* <break>
+# TODO: to implement this, we need more than one look-ahead hard break in play at a time.
+# Requires an engine enhancement.
+# / $SP* $ZW
+
+# Special reverse rule for people & body part emoji:
+# don't break $ZWJ from subsequent $EmojiForSeqs
+$EmojiForSeqs $ZWJ $EmojiMods? $EmojiVar? $EmojiForSeqs;
+
+# LB 9,10 Combining marks.
+# X $CM needs to behave like X, where X is not $SP or controls.
+# $CM not covered by the above needs to behave like $AL
+# Stick together any combining sequences that don't match other rules.
+$CM+ $CAN_CM;
+
+
+# LB 11
+$CM* $WJ $CM* $CAN_CM;
+$CM* $WJ [$LB8NonBreaks-$CM];
+
+ $CANT_CM $CM* $WJ;
+$CM* $CAN_CM $CM* $WJ;
+
+# LB 12a
+# [^SP BA HY] x GL
+#
+$CM* $GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HH $HY]];
+
+# LB 12
+# GL x
+#
+$CANT_CM $CM* $GL;
+$CM* $CAN_CM $CM* $GL;
+
+
+# LB 13
+$CL $CM+ $CAN_CM;
+$CP $CM+ $CAN_CM;
+$EX $CM+ $CAN_CM;
+$IS $CM+ $CAN_CM;
+$SY $CM+ $CAN_CM;
+
+$CL [$LB8NonBreaks-$CM];
+$CP [$LB8NonBreaks-$CM];
+$EX [$LB8NonBreaks-$CM];
+$IS [$LB8NonBreaks-$CM];
+$SY [$LB8NonBreaks-$CM];
+
+# Rule 13 & 14 taken together for an edge case.
+# Match this, shown forward
+# OP SP+ ($CM+ behaving as $AL) (CL | CP | EX | IS | SY)
+# This really wants to chain at the $CM+ (which is acting as an $AL)
+# except for $CM chaining being disabled.
+[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP;
+
+# LB 14 OP SP* x
+#
+$CM* $CAN_CM $SP* $CM* $OP;
+ $CANT_CM $SP* $CM* $OP;
+$AL_FOLLOW? $CM+ $SP $SP* $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP* $CM* $OP
+
+ $AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;
+$CM* $AL_FOLLOW_CM $CM+ $SP+ $CM* $OP;
+$SY $CM $SP+ $OP; # TODO: Experiment. Remove.
+
+
+
+# LB 15
+$CM* $OP $SP* $CM* $QU;
+
+# LB 16
+$CM* $NS $SP* $CM* ($CL | $CP);
+
+# LB 17
+$CM* $B2 $SP* $CM* $B2;
+
+# LB 18 break after spaces
+# Nothing explicit needed here.
+
+
+#
+# LB 19
+#
+$CM* $QU $CM* $CAN_CM; # . x QU
+$CM* $QU $LB18NonBreaks;
+
+
+$CM* $CAN_CM $CM* $QU; # QU x .
+ $CANT_CM $CM* $QU;
+
+#
+# LB 20 Break before and after CB.
+# nothing needed here.
+#
+
+# LB 20.09 added rule for Finnish tailoring
+$AL ($HY | $HH) / $SP;
+
+# LB 21
+$CM* ($BA | $HH | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HH | HY | NS)
+
+$CM* [$LB20NonBreaks-$CM] $CM* $BB; # BB x .
+[^$CB] $CM* $BB; #
+
+# LB21a
+[^$CB] $CM* ($HY | $BA | $HH) $CM* $HL;
+
+# LB21b (reverse)
+$CM* $HL $CM* $SY;
+
+# LB 22
+$CM* $IN $CM* ($ALPlus | $HL);
+$CM* $IN $CM* $ID;
+$CM* $IN $CM* $IN;
+$CM* $IN $CM* $NU;
+
+# LB 23
+$CM* $PO $CM* $ID;
+$CM* $NU $CM* ($ALPlus | $HL);
+$CM* ($ALPlus | $HL) $CM* $NU;
+
+# LB 24
+$CM* $ID $CM* $PR;
+$CM* ($ALPlus | $HL) $CM* $PR;
+$CM* ($ALPlus | $HL) $CM* $PO;
+
+
+# LB 25
+($CM* ($PR | $PO))? ($CM* ($CL | $CP))? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP | $HY))? ($CM* ($PR | $PO))?;
+
+# LB 26
+$CM* ($H3 | $H2 | $JV | $JL) $CM* $JL;
+$CM* ($JT | $JV) $CM* ($H2 | $JV);
+$CM* $JT $CM* ($H3 | $JT);
+
+# LB 27
+$CM* $IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
+$CM* $PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
+$CM* ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
+
+# LB 28
+$CM* ($ALPlus | $HL) $CM* ($ALPlus | $HL);
+
+
+# LB 29
+$CM* ($ALPlus | $HL) $CM* $IS;
+
+# LB 30
+$CM* $OP $CM* ($ALPlus | $HL | $NU);
+$CM* ($ALPlus | $HL | $NU) $CM* $CP;
+
+# LB 30a
+$CM* $RI $CM* $RI;
+
+# Special reverse rule for people & body part emoji:
+# don't break between relevant emoji and $EmojiMods
+$EmojiMods $EmojiVar? $EmojiForMods;
+
+## -------------------------------------------------
+
+!!safe_reverse;
+
+# LB 9
+$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
+$CM+ $SP / .;
+
+# LB 14
+$SP+ $CM* $OP;
+
+# LB 15
+$SP+ $CM* $QU;
+
+# LB 16
+$SP+ $CM* ($CL | $CP);
+
+# LB 17
+$SP+ $CM* $B2;
+
+# LB 21
+$CM* ($HY | $BA | $HH) $CM* $HL;
+
+# LB 25
+($CM* ($IS | $SY))+ $CM* $NU;
+($CL | $CP) $CM* ($NU | $IS | $SY);
+
+# For dictionary-based break
+$dictionary $dictionary;
+
+## -------------------------------------------------
+
+!!safe_forward;
+
+# Skip forward over all character classes that are involved in
+# rules containing patterns with possibly more than one char
+# of context.
+#
+# It might be slightly more efficient to have specific rules
+# instead of one generic one, but only if we could
+# turn off rule chaining. We don't want to move more
+# than necessary.
+#
+[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $SP $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $dictionary];
+$dictionary $dictionary;
+
boundaries{
grapheme:process(dependency){"char.brk"}
line:process(dependency){"line.brk"}
+ line_loose:process(dependency){"line_loose.brk"}
+ line_normal:process(dependency){"line_normal.brk"}
+ line_strict:process(dependency){"line.brk"}
sentence:process(dependency){"sent.brk"}
title:process(dependency){"title.brk"}
word:process(dependency){"word.brk"}
#
-# Copyright (C) 2002-2013, International Business Machines Corporation
+# Copyright (C) 2002-2015, International Business Machines Corporation
# and others. All Rights Reserved.
#
# file: word.txt
$Han = [:Han:];
$Hiragana = [:Hiragana:];
-$RI_A = \U0001F1E6; # Trail ERTU
-$RI_B = \U0001F1E7; # Trail EGR
-$RI_C = \U0001F1E8; # Trail AHLNZ
+$RI_A = \U0001F1E6; # Trail ETU
+$RI_B = \U0001F1E7; # Trail ER
+$RI_C = \U0001F1E8; # Trail AHLNO
$RI_D = \U0001F1E9; # Trail EK
-$RI_E = \U0001F1EA; # Trail GS
+$RI_E = \U0001F1EA; # Trail S
$RI_F = \U0001F1EB; # Trail IR
-$RI_G = \U0001F1EC; # Trail BR
-$RI_H = \U0001F1ED; # Trail KU
-$RI_I = \U0001F1EE; # Trail DLNT
+$RI_G = \U0001F1EC; # Trail B
+$RI_H = \U0001F1ED; # Trail K
+$RI_I = \U0001F1EE; # Trail DELNT
$RI_J = \U0001F1EF; # Trail OP
$RI_K = \U0001F1F0; # Trail R
-$RI_L = \U0001F1F1; # Trail B
$RI_M = \U0001F1F2; # Trail OXY
-$RI_N = \U0001F1F3; # Trail LO
-$RI_P = \U0001F1F5; # Trail LT
-$RI_R = \U0001F1F7; # Trail OU
-$RI_S = \U0001F1F8; # Trail AEGK
-$RI_T = \U0001F1F9; # Trail HRW
-$RI_U = \U0001F1FA; # Trail AS
+$RI_N = \U0001F1F3; # Trail LOZ
+$RI_P = \U0001F1F5; # Trail HLRT
+$RI_R = \U0001F1F7; # Trail U
+$RI_S = \U0001F1F8; # Trail AEG
+$RI_T = \U0001F1F9; # Trail R
+$RI_U = \U0001F1FA; # Trail S
$RI_V = \U0001F1FB; # Trail N
+$RI_Z = \U0001F1FF; # Trail A
-$RI_A_End = [\U0001F1EA \U0001F1F7 \U0001F1F9 \U0001F1FA]; # ERTU
-$RI_B_End = [\U0001F1EA \U0001F1EC \U0001F1F7]; # EGR
-$RI_C_End = [\U0001F1E6 \U0001F1ED \U0001F1F1 \U0001F1F3 \U0001F1FF]; # AHLNZ
+$RI_A_End = [\U0001F1EA \U0001F1F9 \U0001F1FA]; # ETU
+$RI_B_End = [\U0001F1EA \U0001F1F7]; # ER
+$RI_C_End = [\U0001F1E6 \U0001F1ED \U0001F1F1 \U0001F1F3 \U0001F1F4]; # AHLNO
$RI_D_End = [\U0001F1EA \U0001F1F0]; # EK
-$RI_E_End = [\U0001F1EC \U0001F1F8]; # GS
+$RI_E_End = \U0001F1F8; # S
$RI_F_End = [\U0001F1EE \U0001F1F7]; # IR
-$RI_G_End = [\U0001F1E7 \U0001F1F7]; # BR
-$RI_H_End = [\U0001F1F0 \U0001F1FA]; # KU
-$RI_I_End = [\U0001F1E9 \U0001F1F1 \U0001F1F3 \U0001F1F9]; # DLNT
-$RI_J_End = [\U0001F1F4 \U0001F1F5]; # OP
+$RI_G_End = \U0001F1E7; # B
+$RI_H_End = \U0001F1F0; # K
+$RI_I_End = [\U0001F1E9 \U0001F1EA \U0001F1F1 \U0001F1F3 \U0001F1F9]; # DELNT
+$RI_J_End = [\U0001F1F5 \U0001F1F4]; # OP
$RI_K_End = \U0001F1F7; # R
-$RI_L_End = \U0001F1E7; # B
$RI_M_End = [\U0001F1F4 \U0001F1FD \U0001F1FE]; # OXY
-$RI_N_End = [\U0001F1F1 \U0001F1F4]; # LO
-$RI_P_End = [\U0001F1F1 \U0001F1F9]; # LT
-$RI_R_End = [\U0001F1F4 \U0001F1FA]; # OU
-$RI_S_End = [\U0001F1E6 \U0001F1EA \U0001F1EC \U0001F1F0]; # AEGK
-$RI_T_End = [\U0001F1ED \U0001F1F7 \U0001F1FC]; # HRW
-$RI_U_End = [\U0001F1E6 \U0001F1F8]; # AS
+$RI_N_End = [\U0001F1F1 \U0001F1F4 \U0001F1FF]; # LOZ
+$RI_P_End = [\U0001F1ED \U0001F1F1 \U0001F1F7 \U0001F1F9]; # HLRT
+$RI_R_End = \U0001F1FA; # U
+$RI_S_End = [\U0001F1E6 \U0001F1EA \U0001F1EC]; # AEG
+$RI_T_End = \U0001F1F7; # R
+$RI_U_End = \U0001F1F8; # S
$RI_V_End = \U0001F1F3; # N
+$RI_Z_End = \U0001F1E6; # A
+# Special character classes for people & body part emoji:
+# Subsets of $Extend:
+$ZWJ = \u200D;
+$EmojiVar = \uFE0F;
+# The following are subsets of \p{Word_Break = Other}
+$EmojiForSeqs = [\u2764 \U0001F466-\U0001F469 \U0001F48B];
+$EmojiForMods = [\u261D \u270A-\u270C \U0001F385 \U0001F3C3-\U0001F3C4 \U0001F3C7 \U0001F3CA \U0001F442-\U0001F443 \U0001F446-\U0001F450 \U0001F466-\U0001F469 \U0001F46E-\U0001F478 \U0001F47C \U0001F481-\U0001F483 \U0001F485-\U0001F487 \U0001F4AA \U0001F596 \U0001F645-\U0001F647 \U0001F64B-\U0001F64F \U0001F6A3 \U0001F6B4-\U0001F6B6 \U0001F6C0];
+$EmojiMods = [\U0001F3FB-\U0001F3FF];
# Dictionary character set, for triggering language-based break engines. Currently
# limited to LineBreak=Complex_Context. Note that this set only works in Unicode
$RI_I ($Extend|$Format)* $RI_I_End ($Extend|$Format)*;
$RI_J ($Extend|$Format)* $RI_J_End ($Extend|$Format)*;
$RI_K ($Extend|$Format)* $RI_K_End ($Extend|$Format)*;
-$RI_L ($Extend|$Format)* $RI_L_End ($Extend|$Format)*;
$RI_M ($Extend|$Format)* $RI_M_End ($Extend|$Format)*;
$RI_N ($Extend|$Format)* $RI_N_End ($Extend|$Format)*;
$RI_P ($Extend|$Format)* $RI_P_End ($Extend|$Format)*;
$RI_T ($Extend|$Format)* $RI_T_End ($Extend|$Format)*;
$RI_U ($Extend|$Format)* $RI_U_End ($Extend|$Format)*;
$RI_V ($Extend|$Format)* $RI_V_End ($Extend|$Format)*;
+$RI_Z ($Extend|$Format)* $RI_Z_End ($Extend|$Format)*;
+
+# Special forward rules for people & body part emoji:
+# don't break $ZWJ from subsequent $EmojiForSeqs; don't break between relevant emoji and $EmojiMods
+$ZWJ $EmojiForSeqs;
+$EmojiForMods $EmojiVar? $EmojiMods;
# special handling for CJK characters: chain for later dictionary segmentation
$HangulSyllable $HangulSyllable {200};
($Format|$Extend)* $RI_I_End ($Format|$Extend)* $RI_I;
($Format|$Extend)* $RI_J_End ($Format|$Extend)* $RI_J;
($Format|$Extend)* $RI_K_End ($Format|$Extend)* $RI_K;
-($Format|$Extend)* $RI_L_End ($Format|$Extend)* $RI_L;
($Format|$Extend)* $RI_M_End ($Format|$Extend)* $RI_M;
($Format|$Extend)* $RI_N_End ($Format|$Extend)* $RI_N;
($Format|$Extend)* $RI_P_End ($Format|$Extend)* $RI_P;
($Format|$Extend)* $RI_T_End ($Format|$Extend)* $RI_T;
($Format|$Extend)* $RI_U_End ($Format|$Extend)* $RI_U;
($Format|$Extend)* $RI_V_End ($Format|$Extend)* $RI_V;
+($Format|$Extend)* $RI_Z_End ($Format|$Extend)* $RI_Z;
+
+# Special reverse rules for people & body part emoji:
+# don't break $ZWJ from subsequent $EmojiForSeqs; don't break between relevant emoji and $EmojiMods
+$EmojiForSeqs $ZWJ;
+$EmojiMods $EmojiVar? $EmojiForMods;
# special handling for CJK characters: chain for later dictionary segmentation
$HangulSyllable $HangulSyllable;
#
-# Copyright (C) 2002-2013, International Business Machines Corporation
+# Copyright (C) 2002-2015, International Business Machines Corporation
# and others. All Rights Reserved.
#
# file: word_POSIX.txt
$Han = [:Han:];
$Hiragana = [:Hiragana:];
-$RI_A = \U0001F1E6; # Trail ERTU
-$RI_B = \U0001F1E7; # Trail EGR
-$RI_C = \U0001F1E8; # Trail AHLNZ
+$RI_A = \U0001F1E6; # Trail ETU
+$RI_B = \U0001F1E7; # Trail ER
+$RI_C = \U0001F1E8; # Trail AHLNO
$RI_D = \U0001F1E9; # Trail EK
-$RI_E = \U0001F1EA; # Trail GS
+$RI_E = \U0001F1EA; # Trail S
$RI_F = \U0001F1EB; # Trail IR
-$RI_G = \U0001F1EC; # Trail BR
-$RI_H = \U0001F1ED; # Trail KU
-$RI_I = \U0001F1EE; # Trail DLNT
+$RI_G = \U0001F1EC; # Trail B
+$RI_H = \U0001F1ED; # Trail K
+$RI_I = \U0001F1EE; # Trail DELNT
$RI_J = \U0001F1EF; # Trail OP
$RI_K = \U0001F1F0; # Trail R
-$RI_L = \U0001F1F1; # Trail B
$RI_M = \U0001F1F2; # Trail OXY
-$RI_N = \U0001F1F3; # Trail LO
-$RI_P = \U0001F1F5; # Trail LT
-$RI_R = \U0001F1F7; # Trail OU
-$RI_S = \U0001F1F8; # Trail AEGK
-$RI_T = \U0001F1F9; # Trail HRW
-$RI_U = \U0001F1FA; # Trail AS
+$RI_N = \U0001F1F3; # Trail LOZ
+$RI_P = \U0001F1F5; # Trail HLRT
+$RI_R = \U0001F1F7; # Trail U
+$RI_S = \U0001F1F8; # Trail AEG
+$RI_T = \U0001F1F9; # Trail R
+$RI_U = \U0001F1FA; # Trail S
$RI_V = \U0001F1FB; # Trail N
+$RI_Z = \U0001F1FF; # Trail A
-$RI_A_End = [\U0001F1EA \U0001F1F7 \U0001F1F9 \U0001F1FA]; # ERTU
-$RI_B_End = [\U0001F1EA \U0001F1EC \U0001F1F7]; # EGR
-$RI_C_End = [\U0001F1E6 \U0001F1ED \U0001F1F1 \U0001F1F3 \U0001F1FF]; # AHLNZ
+$RI_A_End = [\U0001F1EA \U0001F1F9 \U0001F1FA]; # ETU
+$RI_B_End = [\U0001F1EA \U0001F1F7]; # ER
+$RI_C_End = [\U0001F1E6 \U0001F1ED \U0001F1F1 \U0001F1F3 \U0001F1F4]; # AHLNO
$RI_D_End = [\U0001F1EA \U0001F1F0]; # EK
-$RI_E_End = [\U0001F1EC \U0001F1F8]; # GS
+$RI_E_End = \U0001F1F8; # S
$RI_F_End = [\U0001F1EE \U0001F1F7]; # IR
-$RI_G_End = [\U0001F1E7 \U0001F1F7]; # BR
-$RI_H_End = [\U0001F1F0 \U0001F1FA]; # KU
-$RI_I_End = [\U0001F1E9 \U0001F1F1 \U0001F1F3 \U0001F1F9]; # DLNT
-$RI_J_End = [\U0001F1F4 \U0001F1F5]; # OP
+$RI_G_End = \U0001F1E7; # B
+$RI_H_End = \U0001F1F0; # K
+$RI_I_End = [\U0001F1E9 \U0001F1EA \U0001F1F1 \U0001F1F3 \U0001F1F9]; # DELNT
+$RI_J_End = [\U0001F1F5 \U0001F1F4]; # OP
$RI_K_End = \U0001F1F7; # R
-$RI_L_End = \U0001F1E7; # B
$RI_M_End = [\U0001F1F4 \U0001F1FD \U0001F1FE]; # OXY
-$RI_N_End = [\U0001F1F1 \U0001F1F4]; # LO
-$RI_P_End = [\U0001F1F1 \U0001F1F9]; # LT
-$RI_R_End = [\U0001F1F4 \U0001F1FA]; # OU
-$RI_S_End = [\U0001F1E6 \U0001F1EA \U0001F1EC \U0001F1F0]; # AEGK
-$RI_T_End = [\U0001F1ED \U0001F1F7 \U0001F1FC]; # HRW
-$RI_U_End = [\U0001F1E6 \U0001F1F8]; # AS
+$RI_N_End = [\U0001F1F1 \U0001F1F4 \U0001F1FF]; # LOZ
+$RI_P_End = [\U0001F1ED \U0001F1F1 \U0001F1F7 \U0001F1F9]; # HLRT
+$RI_R_End = \U0001F1FA; # U
+$RI_S_End = [\U0001F1E6 \U0001F1EA \U0001F1EC]; # AEG
+$RI_T_End = \U0001F1F7; # R
+$RI_U_End = \U0001F1F8; # S
$RI_V_End = \U0001F1F3; # N
+$RI_Z_End = \U0001F1E6; # A
+# Special character classes for people & body part emoji:
+# Subsets of $Extend:
+$ZWJ = \u200D;
+$EmojiVar = \uFE0F;
+# The following are subsets of \p{Word_Break = Other}
+$EmojiForSeqs = [\u2764 \U0001F466-\U0001F469 \U0001F48B];
+$EmojiForMods = [\u261D \u270A-\u270C \U0001F385 \U0001F3C3-\U0001F3C4 \U0001F3C7 \U0001F3CA \U0001F442-\U0001F443 \U0001F446-\U0001F450 \U0001F466-\U0001F469 \U0001F46E-\U0001F478 \U0001F47C \U0001F481-\U0001F483 \U0001F485-\U0001F487 \U0001F4AA \U0001F596 \U0001F645-\U0001F647 \U0001F64B-\U0001F64F \U0001F6A3 \U0001F6B4-\U0001F6B6 \U0001F6C0];
+$EmojiMods = [\U0001F3FB-\U0001F3FF];
# Dictionary character set, for triggering language-based break engines. Currently
# limited to LineBreak=Complex_Context. Note that this set only works in Unicode
$RI_I ($Extend|$Format)* $RI_I_End ($Extend|$Format)*;
$RI_J ($Extend|$Format)* $RI_J_End ($Extend|$Format)*;
$RI_K ($Extend|$Format)* $RI_K_End ($Extend|$Format)*;
-$RI_L ($Extend|$Format)* $RI_L_End ($Extend|$Format)*;
$RI_M ($Extend|$Format)* $RI_M_End ($Extend|$Format)*;
$RI_N ($Extend|$Format)* $RI_N_End ($Extend|$Format)*;
$RI_P ($Extend|$Format)* $RI_P_End ($Extend|$Format)*;
$RI_T ($Extend|$Format)* $RI_T_End ($Extend|$Format)*;
$RI_U ($Extend|$Format)* $RI_U_End ($Extend|$Format)*;
$RI_V ($Extend|$Format)* $RI_V_End ($Extend|$Format)*;
+$RI_Z ($Extend|$Format)* $RI_Z_End ($Extend|$Format)*;
+
+# Special forward rules for people & body part emoji:
+# don't break $ZWJ from subsequent $EmojiForSeqs; don't break between relevant emoji and $EmojiMods
+$ZWJ $EmojiForSeqs;
+$EmojiForMods $EmojiVar? $EmojiMods;
# special handling for CJK characters: chain for later dictionary segmentation
$HangulSyllable $HangulSyllable {200};
($Format|$Extend)* $RI_I_End ($Format|$Extend)* $RI_I;
($Format|$Extend)* $RI_J_End ($Format|$Extend)* $RI_J;
($Format|$Extend)* $RI_K_End ($Format|$Extend)* $RI_K;
-($Format|$Extend)* $RI_L_End ($Format|$Extend)* $RI_L;
($Format|$Extend)* $RI_M_End ($Format|$Extend)* $RI_M;
($Format|$Extend)* $RI_N_End ($Format|$Extend)* $RI_N;
($Format|$Extend)* $RI_P_End ($Format|$Extend)* $RI_P;
($Format|$Extend)* $RI_T_End ($Format|$Extend)* $RI_T;
($Format|$Extend)* $RI_U_End ($Format|$Extend)* $RI_U;
($Format|$Extend)* $RI_V_End ($Format|$Extend)* $RI_V;
+($Format|$Extend)* $RI_Z_End ($Format|$Extend)* $RI_Z;
+
+# Special reverse rules for people & body part emoji:
+# don't break $ZWJ from subsequent $EmojiForSeqs; don't break between relevant emoji and $EmojiMods
+$EmojiForSeqs $ZWJ;
+$EmojiMods $EmojiVar? $EmojiForMods;
# special handling for CJK characters: chain for later dictionary segmentation
$HangulSyllable $HangulSyllable;
--- /dev/null
+// ***************************************************************************
+// *
+// * Copyright (C) 2014 International Business Machines
+// * Corporation and others. All Rights Reserved.
+// * Tool: org.unicode.cldr.icu.NewLdml2IcuConverter
+// * Source File: <path>/common/segments/zh.xml ../../xml/brkitr/zh.xml
+// *
+// ***************************************************************************
+zh{
+ Version{"2.0.82.42"}
+ boundaries{
+ line:process(dependency){"line.brk"}
+ line_loose:process(dependency){"line_loose_cj.brk"}
+ line_normal:process(dependency){"line_normal_cj.brk"}
+ line_strict:process(dependency){"line.brk"}
+ }
+}
--- /dev/null
+// ***************************************************************************
+// *
+// * Copyright (C) 2014 International Business Machines
+// * Corporation and others. All Rights Reserved.
+// * Tool: org.unicode.cldr.icu.NewLdml2IcuConverter
+// * Source File: <path>/common/segments/zh_Hant.xml ../../xml/brkitr/zh_Hant.xml
+// *
+// ***************************************************************************
+zh_Hant{
+ Version{"2.0.82.42"}
+ boundaries{
+ line:process(dependency){"line.brk"}
+ line_loose:process(dependency){"line_loose_cj.brk"}
+ line_normal:process(dependency){"line_normal_cj.brk"}
+ line_strict:process(dependency){"line.brk"}
+ }
+}
// ***************************************************************************
// *
-// * Copyright (C) 2014 International Business Machines
+// * Copyright (C) 2015 International Business Machines
// * Corporation and others. All Rights Reserved.
// * Tool: org.unicode.cldr.icu.NewLdml2IcuConverter
// * Source File: <path>/supplementalData.xml
}
}
LT{
+ {
+ from:intvector{
+ 330,
+ -1563774976,
+ }
+ id{"EUR"}
+ }
{
from:intvector{
172,
-2062942208,
}
id{"LTL"}
+ to:intvector{
+ 330,
+ -1563774977,
+ }
}
{
from:intvector{
other{"{0} unser"}
}
pound{
- one{"{0} skålpund"}
- other{"{0} skålpund"}
+ one{"{0} pund"}
+ other{"{0} pund"}
}
stone{
one{"{0} stone"}
other{"{0} unser"}
}
pound{
- one{"{0} pund"}
- other{"{0} pund"}
+ one{"{0} lb"}
+ other{"{0} lb"}
}
}
power{
other{"{0} unser"}
}
pound{
- one{"{0} skålpund"}
- other{"{0} skålpund"}
+ one{"{0} pund"}
+ other{"{0} pund"}
}
stone{
one{"{0} st"}
}
minute{
one{"{0} min"}
- other{"{0} mins"}
+ other{"{0} min"}
}
month{
one{"{0} mth"}
middle{"{0}, {1}"}
start{"{0}, {1}"}
}
+ unit-narrow{
+ 2{"{0} {1}"}
+ end{"{0} {1}"}
+ middle{"{0} {1}"}
+ start{"{0} {1}"}
+ }
unit-short{
2{"{0} y {1}"}
end{"{0}, {1}"}
}
duration{
day{
- one{"{0} d"}
- other{"{0} d"}
+ one{"{0}d"}
+ other{"{0}d"}
}
hour{
- one{"{0} h"}
- other{"{0} h"}
+ one{"{0}h"}
+ other{"{0}h"}
}
millisecond{
- one{"{0} ms"}
- other{"{0} ms"}
+ one{"{0}ms"}
+ other{"{0}ms"}
}
minute{
- one{"{0} min"}
- other{"{0} min"}
+ one{"{0}min"}
+ other{"{0}min"}
}
month{
- one{"{0} m"}
- other{"{0} m"}
+ one{"{0}m"}
+ other{"{0}m"}
}
second{
- one{"{0} s"}
- other{"{0} s"}
+ one{"{0}s"}
+ other{"{0}s"}
}
week{
- one{"{0} semana"}
- other{"{0} sem"}
+ one{"{0}sem"}
+ other{"{0}sem"}
}
year{
one{"{0}a"}
- other{"{0} a"}
+ other{"{0}a"}
}
}
energy{
"vie",
"sáb",
}
+ narrow{
+ "D",
+ "L",
+ "M",
+ "M",
+ "J",
+ "V",
+ "S",
+ }
short{
"DO",
"LU",
start{"{0}, {1}"}
}
unit-narrow{
- 2{"{0} y {1}"}
- end{"{0}, {1}"}
- middle{"{0}, {1}"}
- start{"{0}, {1}"}
+ 2{"{0} {1}"}
+ end{"{0} {1}"}
+ middle{"{0} {1}"}
+ start{"{0} {1}"}
}
unit-short{
2{"{0} y {1}"}
}
duration{
day{
- one{"{0} d"}
- other{"{0} d"}
+ one{"{0}d"}
+ other{"{0}d"}
}
hour{
- one{"{0} h"}
- other{"{0} h"}
+ one{"{0}h"}
+ other{"{0}h"}
}
millisecond{
- one{"{0} ms"}
- other{"{0} ms"}
+ one{"{0}ms"}
+ other{"{0}ms"}
}
minute{
- one{"{0} min"}
- other{"{0} min"}
+ one{"{0}min"}
+ other{"{0}min"}
}
month{
- one{"{0} m"}
- other{"{0} m"}
+ one{"{0}m"}
+ other{"{0}m"}
}
week{
- one{"{0} sem"}
- other{"{0} sem"}
+ one{"{0}sem"}
+ other{"{0}sem"}
}
year{
one{"{0}a"}
- other{"{0} a"}
+ other{"{0}a"}
}
}
}
other{"{0} ms"}
}
minute{
- one{"{0} min"}
- other{"{0} min"}
+ one{"{0} mn"}
+ other{"{0} mn"}
}
month{
one{"{0} m"}
}
}
stand-alone{
+ abbreviated{
+ "Moh.",
+ "Saf.",
+ "Rébi I",
+ "Rébi II",
+ "Dsem. I",
+ "Dsem. II",
+ "Red.",
+ "Sab.",
+ "Ram.",
+ "Sev.",
+ "Dsül k.",
+ "Dsül h.",
+ }
narrow{
"1",
"2",
Timezone{"{0} {1}"}
}
availableFormats{
- EHm{"E HH.mm"}
+ EHm{"E HH:mm"}
EHms{"E HH:mm:ss"}
Ed{"E d"}
- Ehm{"E h.mm a"}
+ Ehm{"E h:mm a"}
Ehms{"E h:mm:ss a"}
Gy{"y G"}
GyMMM{"MMM y G"}
start{"{0}, {1}"}
}
unit-narrow{
- 2{"{0}, {1}"}
- end{"{0}, e {1}"}
- middle{"{0}, {1}"}
- start{"{0}, {1}"}
+ 2{"{0} {1}"}
+ end{"{0} {1}"}
+ middle{"{0} {1}"}
+ start{"{0} {1}"}
}
unit-short{
2{"{0}, {1}"}
}
duration{
day{
- one{"{0} gg"}
- other{"{0} gg"}
+ one{"{0}gg"}
+ other{"{0}gg"}
}
hour{
- one{"{0} h"}
- other{"{0} h"}
+ one{"{0}h"}
+ other{"{0}h"}
}
millisecond{
- one{"{0} ms"}
- other{"{0} ms"}
+ one{"{0}ms"}
+ other{"{0}ms"}
}
minute{
- one{"{0} m"}
- other{"{0} m"}
+ one{"{0}min"}
+ other{"{0}min"}
}
month{
- one{"{0} mesi"}
- other{"{0} mesi"}
+ one{"{0}mesi"}
+ other{"{0}mesi"}
}
second{
- one{"{0} s"}
- other{"{0} s"}
+ one{"{0}s"}
+ other{"{0}s"}
}
week{
- one{"{0} sett."}
- other{"{0} sett."}
+ one{"{0}sett."}
+ other{"{0}sett."}
}
year{
- one{"{0} anno"}
- other{"{0} anni"}
+ one{"{0}anno"}
+ other{"{0}anni"}
}
}
energy{
calendar{
generic{
DateTimePatterns{
- "HH.mm:ss 'h' zzzz",
+ "HH:mm:ss 'h' zzzz",
"HH:mm:ss z",
"HH:mm:ss",
"HH:mm",
}
gregorian{
DateTimePatterns{
- "HH.mm:ss 'h' zzzz",
+ "HH:mm:ss 'h' zzzz",
"HH:mm:ss z",
"HH:mm:ss",
"HH:mm",
}
length{
centimeter{
- other{"{0} sm"}
+ other{"{0} cm"}
}
foot{
other{"{0}'"}
}
length{
centimeter{
- other{"{0} sm"}
+ other{"{0} cm"}
}
foot{
other{"{0} ka"}
}
}
contextTransforms{
+ day-format-except-narrow:intvector{
+ 0,
+ 1,
+ }
+ day-standalone-except-narrow:intvector{
+ 0,
+ 1,
+ }
month-format-except-narrow:intvector{
0,
1,
}
monthNames{
format{
+ abbreviated{
+ "muharram",
+ "safar",
+ "rabi’ al-awwal",
+ "rabi’ al-akhir",
+ "jumada-l-ula",
+ "jumada-l-akhira",
+ "rajab",
+ "sha’ban",
+ "ramadan",
+ "shawwal",
+ "dhu-l-ga’da",
+ "dhu-l-hijja",
+ }
wide{
"muharram",
"safar",
}
}
stand-alone{
+ abbreviated{
+ "Muharram",
+ "Safar",
+ "Rabi’ al-awwal",
+ "Rabi’ al-akhir",
+ "Jumada-l-ula",
+ "Jumada-l-akhira",
+ "Rajab",
+ "Sha’ban",
+ "Ramadan",
+ "Shawwal",
+ "Dhu-l-ga’da",
+ "Dhu-l-hijja",
+ }
wide{
"Muharram",
"Safar",
}
}
stand-alone{
+ abbreviated{
+ "Muharrem",
+ "Safer",
+ "Rebiülevvel",
+ "Rebiülahir",
+ "Cemaziyelevvel",
+ "Cemaziyelahir",
+ "Recep",
+ "Şaban",
+ "Ramazan",
+ "Şevval",
+ "Zilkade",
+ "Zilhicce",
+ }
narrow{
"1",
"2",
"Gy年M月d日",
"Gy年M月d日",
"d/M/yyGGGGG",
- "{1}{0}",
- "{1}{0}",
- "{1}{0}",
- "{1}{0}",
- "{1}{0}",
+ "{1} {0}",
+ "{1} {0}",
+ "{1} {0}",
+ "{1} {0}",
+ "{1} {0}",
}
availableFormats{
HHmm{"HH:mm"}
"y年M月d日",
"d/M/yy",
"{1} {0}",
- "{1}{0}",
- "{1}{0}",
+ "{1} {0}",
+ "{1} {0}",
"{1} {0}",
"{1} {0}",
}
"y年M月d日",
"d/M/yy",
"{1} {0}",
- "{1}{0}",
- "{1}{0}",
+ "{1} {0}",
+ "{1} {0}",
"{1} {0}",
"{1} {0}",
}
start{"{0}、{1}"}
}
unit{
+ 2{"{0} {1}"}
+ end{"{0} {1}"}
+ middle{"{0} {1}"}
+ start{"{0} {1}"}
+ }
+ unit-narrow{
2{"{0}{1}"}
end{"{0}{1}"}
middle{"{0}{1}"}
start{"{0}{1}"}
}
unit-short{
- 2{"{0}{1}"}
- end{"{0}{1}"}
- middle{"{0}{1}"}
- start{"{0}{1}"}
+ 2{"{0} {1}"}
+ end{"{0} {1}"}
+ middle{"{0} {1}"}
+ start{"{0} {1}"}
}
}
measurementSystemNames{
}
duration{
day{
- other{"{0} 天"}
+ other{"{0}天"}
}
hour{
- other{"{0} 小時"}
+ other{"{0}時"}
}
millisecond{
- other{"{0} 毫秒"}
+ other{"{0}毫秒"}
}
minute{
- other{"{0} 分鐘"}
+ other{"{0}分"}
}
month{
- other{"{0} 個月"}
+ other{"{0}個月"}
}
second{
- other{"{0} 秒"}
+ other{"{0}秒"}
}
week{
- other{"{0} 週"}
+ other{"{0}週"}
}
year{
- other{"{0} 年"}
+ other{"{0}年"}
}
}
energy{
"Gy年M月d日",
"Gy年M月d日",
"d/M/yGGGGG",
- "{1}{0}",
"{1} {0}",
"{1} {0}",
- "{1}{0}",
- "{1}{0}",
+ "{1} {0}",
+ "{1} {0}",
+ "{1} {0}",
}
availableFormats{
Ed{"d E"}
"y年M月d日",
"y年M月d日",
"d/M/y",
- "{1}{0}",
"{1} {0}",
"{1} {0}",
- "{1}{0}",
- "{1}{0}",
+ "{1} {0}",
+ "{1} {0}",
+ "{1} {0}",
}
availableFormats{
Ed{"d E"}
}
duration{
week{
- other{"{0}星期"}
+ other{"{0} 星期"}
}
}
energy{
"Gy年M月d日",
"Gy年M月d日",
"d/M/yGGGGG",
- "{1}{0}",
"{1} {0}",
"{1} {0}",
- "{1}{0}",
- "{1}{0}",
+ "{1} {0}",
+ "{1} {0}",
+ "{1} {0}",
}
availableFormats{
Ed{"d E"}
"y年M月d日",
"y年M月d日",
"d/M/y",
- "{1}{0}",
"{1} {0}",
"{1} {0}",
- "{1}{0}",
- "{1}{0}",
+ "{1} {0}",
+ "{1} {0}",
+ "{1} {0}",
}
availableFormats{
Ed{"d E"}
// ***************************************************************************
// *
-// * Copyright (C) 2014 International Business Machines
+// * Copyright (C) 2015 International Business Machines
// * Corporation and others. All Rights Reserved.
// * Tool: org.unicode.cldr.icu.NewLdml2IcuConverter
// * Source File: <path>/metaZones.xml
{
"America_Central",
"1998-08-02 06:00",
+ "2015-02-01 08:00",
+ }
+ {
+ "America_Eastern",
+ "2015-02-01 08:00",
"9999-12-31 23:59",
}
}
"Asia:Kamchatka"{
{
"Kamchatka",
- "1970-01-01 00:00",
- "2010-03-27 14:00",
- }
- {
- "Magadan",
- "2010-03-27 14:00",
- "9999-12-31 23:59",
}
}
"Asia:Karachi"{
{
"Samara",
"1991-10-20 00:00",
- "2010-03-27 22:00",
- }
- {
- "Moscow",
- "2010-03-27 22:00",
"9999-12-31 23:59",
}
}
// ***************************************************************************
// *
-// * Copyright (C) 2014 International Business Machines
+// * Copyright (C) 2015 International Business Machines
// * Corporation and others. All Rights Reserved.
// * Tool: org.unicode.cldr.icu.NewLdml2IcuConverter
// * Source File: <path>/windowsZones.xml
AU{"Antarctica/Macquarie"}
FM{"Pacific/Ponape Pacific/Kosrae"}
NC{"Pacific/Noumea"}
+ PG{"Pacific/Bougainville"}
SB{"Pacific/Guadalcanal"}
VU{"Pacific/Efate"}
ZZ{"Etc/GMT-11"}
"Central Standard Time (Mexico)"{
001{"America/Mexico_City"}
MX{
- "America/Mexico_City America/Bahia_Banderas America/Cancun America/Me"
- "rida America/Monterrey"
+ "America/Mexico_City America/Bahia_Banderas America/Merida America/Mo"
+ "nterrey"
}
}
"Central Standard Time"{
EC{"America/Guayaquil"}
JM{"America/Jamaica"}
KY{"America/Cayman"}
+ MX{"America/Cancun"}
PA{"America/Panama"}
PE{"America/Lima"}
ZZ{"Etc/GMT+5"}
FM{"Pacific/Truk"}
GU{"Pacific/Guam"}
MP{"Pacific/Saipan"}
- PG{"Pacific/Port_Moresby Pacific/Bougainville"}
+ PG{"Pacific/Port_Moresby"}
ZZ{"Etc/GMT-10"}
}
"Yakutsk Standard Time"{
//---------------------------------------------------------
-// Copyright (C) 2003-2014, International Business Machines
+// Copyright (C) 2003-2015, International Business Machines
// Corporation and others. All Rights Reserved.
//---------------------------------------------------------
// Build tool: tz2icu
-// Build date: Tue Nov 11 12:33:07 2014
+// Build date: Tue Feb 3 16:54:37 2015
// tz database: ftp://ftp.iana.org/tz/
-// tz version: 2014j
+// tz version: 2015a
// ICU version: 55.0.1
//---------------------------------------------------------
// >> !!! >> THIS IS A MACHINE-GENERATED FILE << !!! <<
//---------------------------------------------------------
zoneinfo64:table(nofallback) {
- TZVersion { "2014j" }
+ TZVersion { "2015a" }
Zones:array {
/* ACT */ :int { 347 } //Z#0
/* AET */ :int { 359 } //Z#1
finalYear:int { 2039 }
} //Z#92
/* America/Cancun */ :table {
- trans:intvector { -1514743200, 377935200, 828860400, 846396000, 860310000, 877845600, 891759600, 902037600, 909298800, 923212800, 941353200, 954662400, 972802800, 989136000, 1001833200, 1018166400, 1035702000 }
+ trans:intvector { -1514743200, 377935200, 828860400, 846396000, 860310000, 877845600, 891759600, 902037600, 909298800, 923212800, 941353200, 954662400, 972802800, 989136000, 1001833200, 1018166400, 1035702000, 1049616000, 1067151600, 1081065600, 1099206000, 1112515200, 1130655600, 1143964800, 1162105200, 1175414400, 1193554800, 1207468800, 1225004400, 1238918400, 1256454000, 1270368000, 1288508400, 1301817600, 1319958000, 1333267200, 1351407600, 1365321600, 1382857200, 1396771200, 1414306800, 1422777600 }
typeOffsets:intvector { -20824, 0, -21600, 0, -21600, 3600, -18000, 0, -18000, 3600 }
- typeMap:bin { "0103040304030402010201020102010201" }
- finalRule { "Mexico" }
- finalRaw:int { -21600 }
- finalYear:int { 2003 }
+ typeMap:bin { "010304030403040201020102010201020102010201020102010201020102010201020102010201020103" }
} //Z#93
/* America/Caracas */ :table {
transPre32:intvector { -1, 1770461760 }
typeMap:bin { "010201020102010201020102010201020102010201020102010201020103" }
} //Z#197
/* America/Santiago */ :table {
- trans:intvector { -1893439034, -1688410800, -1619983034, -1593806400, -1335986234, -1317585600, -1304362800, -1286049600, -1272826800, -1254513600, -1241290800, -1222977600, -1209754800, -1191355200, -1178132400, -870552000, -865278000, -736376400, -713646000, -36619200, -23922000, -3355200, 7527600, 24465600, 37767600, 55915200, 69217200, 87969600, 100666800, 118209600, 132116400, 150868800, 163566000, 182318400, 195620400, 213768000, 227070000, 245217600, 258519600, 277272000, 289969200, 308721600, 321418800, 340171200, 353473200, 371620800, 384922800, 403070400, 416372400, 434520000, 447822000, 466574400, 479271600, 498024000, 510721200, 529473600, 545194800, 560923200, 574225200, 591768000, 605674800, 624427200, 637729200, 653457600, 668574000, 687326400, 700628400, 718776000, 732078000, 750225600, 763527600, 781675200, 794977200, 813729600, 826426800, 845179200, 859690800, 876628800, 889930800, 906868800, 923194800, 939528000, 952830000, 971582400, 984279600, 1003032000, 1015729200, 1034481600, 1047178800, 1065931200, 1079233200, 1097380800, 1110682800, 1128830400, 1142132400, 1160884800, 1173582000, 1192334400, 1206846000, 1223784000, 1237086000, 1255233600, 1270350000, 1286683200, 1304823600, 1313899200, 1335668400, 1346558400 }
- typeOffsets:intvector { -16966, 0, -18000, 0, -18000, 3600, -14400, 0, -14400, 3600 }
- typeMap:bin { "010003000201020102010201020102010201030403040304030403040304030403040304030403040304030403040304030403040304030403040304030403040304030403040304030403040304030403040304030403040304030403040304030403040304030403040304" }
- finalRule { "Chile" }
- finalRaw:int { -14400 }
- finalYear:int { 2013 }
+ trans:intvector { -1893439034, -1688410800, -1619983034, -1593806400, -1335986234, -1317585600, -1304362800, -1286049600, -1272826800, -1254513600, -1241290800, -1222977600, -1209754800, -1191355200, -1178132400, -870552000, -865278000, -736376400, -713646000, -36619200, -23922000, -3355200, 7527600, 24465600, 37767600, 55915200, 69217200, 87969600, 100666800, 118209600, 132116400, 150868800, 163566000, 182318400, 195620400, 213768000, 227070000, 245217600, 258519600, 277272000, 289969200, 308721600, 321418800, 340171200, 353473200, 371620800, 384922800, 403070400, 416372400, 434520000, 447822000, 466574400, 479271600, 498024000, 510721200, 529473600, 545194800, 560923200, 574225200, 591768000, 605674800, 624427200, 637729200, 653457600, 668574000, 687326400, 700628400, 718776000, 732078000, 750225600, 763527600, 781675200, 794977200, 813729600, 826426800, 845179200, 859690800, 876628800, 889930800, 906868800, 923194800, 939528000, 952830000, 971582400, 984279600, 1003032000, 1015729200, 1034481600, 1047178800, 1065931200, 1079233200, 1097380800, 1110682800, 1128830400, 1142132400, 1160884800, 1173582000, 1192334400, 1206846000, 1223784000, 1237086000, 1255233600, 1270350000, 1286683200, 1304823600, 1313899200, 1335668400, 1346558400, 1367118000, 1378612800, 1398567600, 1410062400, 1430017200 }
+ typeOffsets:intvector { -16966, 0, -18000, 0, -18000, 3600, -14400, 0, -14400, 3600, -10800, 0 }
+ typeMap:bin { "0100030002010201020102010201020102010304030403040304030403040304030403040304030403040304030403040304030403040304030403040304030403040304030403040304030403040304030403040304030403040304030403040304030403040304030403040304030405" }
links:intvector { 198, 385 }
} //Z#198
/* America/Santo_Domingo */ :table {
} //Z#227
/* Antarctica/McMurdo */ :int { 540 } //Z#228
/* Antarctica/Palmer */ :table {
- trans:intvector { -157766400, -152658000, -132955200, -121122000, -101419200, -86821200, -71092800, -54766800, -39038400, -23317200, -7588800, 128142000, 136605600, 389070000, 403070400, 416372400, 434520000, 447822000, 466574400, 479271600, 498024000, 510721200, 529473600, 545194800, 560923200, 574225200, 591768000, 605674800, 624427200, 637729200, 653457600, 668574000, 687326400, 700628400, 718776000, 732078000, 750225600, 763527600, 781675200, 794977200, 813729600, 826426800, 845179200, 859690800, 876628800, 889930800, 906868800, 923194800, 939528000, 952830000, 971582400, 984279600, 1003032000, 1015729200, 1034481600, 1047178800, 1065931200, 1079233200, 1097380800, 1110682800, 1128830400, 1142132400, 1160884800, 1173582000, 1192334400, 1206846000, 1223784000, 1237086000, 1255233600, 1270350000, 1286683200, 1304823600, 1313899200, 1335668400, 1346558400 }
+ trans:intvector { -157766400, -152658000, -132955200, -121122000, -101419200, -86821200, -71092800, -54766800, -39038400, -23317200, -7588800, 128142000, 136605600, 389070000, 403070400, 416372400, 434520000, 447822000, 466574400, 479271600, 498024000, 510721200, 529473600, 545194800, 560923200, 574225200, 591768000, 605674800, 624427200, 637729200, 653457600, 668574000, 687326400, 700628400, 718776000, 732078000, 750225600, 763527600, 781675200, 794977200, 813729600, 826426800, 845179200, 859690800, 876628800, 889930800, 906868800, 923194800, 939528000, 952830000, 971582400, 984279600, 1003032000, 1015729200, 1034481600, 1047178800, 1065931200, 1079233200, 1097380800, 1110682800, 1128830400, 1142132400, 1160884800, 1173582000, 1192334400, 1206846000, 1223784000, 1237086000, 1255233600, 1270350000, 1286683200, 1304823600, 1313899200, 1335668400, 1346558400, 1367118000, 1378612800, 1398567600, 1410062400, 1430017200 }
typeOffsets:intvector { 0, 0, -14400, 0, -14400, 3600, -10800, 0, -10800, 3600 }
- typeMap:bin { "020102010201020102010304030102010201020102010201020102010201020102010201020102010201020102010201020102010201020102010201020102010201020102010201020102" }
- finalRule { "ChileAQ" }
- finalRaw:int { -14400 }
- finalYear:int { 2013 }
+ typeMap:bin { "0201020102010201020103040301020102010201020102010201020102010201020102010201020102010201020102010201020102010201020102010201020102010201020102010201020102010203" }
} //Z#229
/* Antarctica/Rothera */ :table {
trans:intvector { 218246400 }
typeMap:bin { "01" }
} //Z#234
/* Arctic/Longyearbyen */ :int { 464 } //Z#235
- /* Asia/Aden */ :table {
- trans:intvector { -631162794 }
- typeOffsets:intvector { 10794, 0, 10800, 0 }
- typeMap:bin { "01" }
- } //Z#236
+ /* Asia/Aden */ :int { 303 } //Z#236
/* Asia/Almaty */ :table {
trans:intvector { -1441170468, -1247547600, 354909600, 370717200, 386445600, 402253200, 417981600, 433789200, 449604000, 465336000, 481060800, 496785600, 512510400, 528235200, 543960000, 559684800, 575409600, 591134400, 606859200, 622584000, 638308800, 654638400, 701802000, 717523200, 733262400, 748987200, 764712000, 780436800, 796161600, 811886400, 828216000, 846360000, 859665600, 877809600, 891115200, 909259200, 922564800, 941313600, 954014400, 972763200, 985464000, 1004212800, 1017518400, 1035662400, 1048968000, 1067112000, 1080417600, 1099166400 }
typeOffsets:intvector { 18468, 0, 18000, 0, 21600, 0, 21600, 3600 }
typeOffsets:intvector { 10660, 0, 10656, 0, 10800, 0, 10800, 3600 }
typeMap:bin { "010203020302030203020302030203020302030203020302030203020302030203020302030203020302030203020302030203020302" }
} //Z#244
- /* Asia/Bahrain */ :table {
- trans:intvector { -1577935340, 76190400 }
- typeOffsets:intvector { 12140, 0, 10800, 0, 14400, 0 }
- typeMap:bin { "0201" }
- } //Z#245
+ /* Asia/Bahrain */ :int { 300 } //Z#245
/* Asia/Baku */ :table {
trans:intvector { -1441163964, -405140400, 354916800, 370724400, 386452800, 402260400, 417988800, 433796400, 449611200, 465343200, 481068000, 496792800, 512517600, 528242400, 543967200, 559692000, 575416800, 591141600, 606866400, 622591200, 638316000, 654645600, 670370400, 686098800, 701812800, 717534000, 828234000, 846378000, 859680000, 877824000 }
typeOffsets:intvector { 11964, 0, 10800, 0, 10800, 3600, 14400, 0, 14400, 3600 }
trans:intvector { -1577936472 }
typeOffsets:intvector { 13272, 0, 14400, 0 }
typeMap:bin { "01" }
+ links:intvector { 261, 291 }
} //Z#261
/* Asia/Dushanbe */ :table {
trans:intvector { -1441168512, -1247547600, 354909600, 370717200, 386445600, 402253200, 417981600, 433789200, 449604000, 465336000, 481060800, 496785600, 512510400, 528235200, 543960000, 559684800, 575409600, 591134400, 606859200, 622584000, 638308800, 654638400, 670363200, 684363600 }
typeOffsets:intvector { 26480, 0, 27000, 0, 28800, 0, 28800, 1200, 32400, 0 }
typeMap:bin { "010203020302030203020302030203020402" }
} //Z#284
- /* Asia/Kuwait */ :table {
- trans:intvector { -631163516 }
- typeOffsets:intvector { 11516, 0, 10800, 0 }
- typeMap:bin { "01" }
- } //Z#285
+ /* Asia/Kuwait */ :int { 303 } //Z#285
/* Asia/Macao */ :int { 287 } //Z#286
/* Asia/Macau */ :table {
trans:intvector { -1830411260, -277360200, -257405400, -245910600, -225955800, -214473600, -194506200, -182406600, -163056600, -150969600, -131619600, -117088200, -101367000, -85638600, -69312600, -53584200, -37863000, -22134600, -6413400, 9315000, 25036200, 40764600, 56485800, 72201600, 87922800, 103651200, 119977200, 135705600, 151439400, 167167800, 182889000, 198617400, 214338600, 230067000, 245788200, 261504000, 277225200, 292953600, 309279600, 325008000, 340729200 }
typeOffsets:intvector { -57360, 0, 28800, 0, 28800, 3600, 29040, 0, 32400, 0 }
typeMap:bin { "03010201040102010201" }
} //Z#290
- /* Asia/Muscat */ :table {
- trans:intvector { -1577937264 }
- typeOffsets:intvector { 14064, 0, 14400, 0 }
- typeMap:bin { "01" }
- } //Z#291
+ /* Asia/Muscat */ :int { 261 } //Z#291
/* Asia/Nicosia */ :table {
trans:intvector { -1518920008, 166572000, 182293200, 200959200, 213829200, 228866400, 243982800, 260316000, 276123600, 291765600, 307486800, 323820000, 338936400, 354664800, 370386000, 386114400, 401835600, 417564000, 433285200, 449013600, 465339600, 481068000, 496789200, 512517600, 528238800, 543967200, 559688400, 575416800, 591138000, 606866400, 622587600, 638316000, 654642000, 670370400, 686091600, 701820000, 717541200, 733269600, 748990800, 764719200, 780440400, 796168800, 811890000, 828223200, 843944400, 859672800, 875394000, 891122400, 909277200, 922582800, 941331600 }
typeOffsets:intvector { 8008, 0, 7200, 0, 7200, 3600 }
trans:intvector { -1577935568, 76190400 }
typeOffsets:intvector { 12368, 0, 10800, 0, 14400, 0 }
typeMap:bin { "0201" }
+ links:intvector { 245, 300 }
} //Z#300
/* Asia/Qyzylorda */ :table {
trans:intvector { -1441167712, -1247544000, 354913200, 370720800, 386445600, 402256800, 417985200, 433792800, 449607600, 465339600, 481064400, 496789200, 512514000, 528238800, 543963600, 559688400, 575413200, 591138000, 606862800, 622587600, 638312400, 654642000, 695768400, 701802000, 717523200, 733262400, 748987200, 764712000, 780436800, 796161600, 811886400, 828216000, 846360000, 859665600, 877809600, 891115200, 909259200, 922564800, 941313600, 954014400, 972763200, 985464000, 1004212800, 1017518400, 1035662400, 1048968000, 1067112000, 1080417600, 1099166400 }
trans:intvector { -719636812 }
typeOffsets:intvector { 11212, 0, 10800, 0 }
typeMap:bin { "01" }
+ links:intvector { 236, 285, 303 }
} //Z#303
/* Asia/Saigon */ :int { 266 } //Z#304
/* Asia/Sakhalin */ :table {
finalYear:int { 1997 }
} //Z#336
/* Atlantic/Reykjavik */ :table {
- transPre32:intvector { -1, 97919740 }
- trans:intvector { -1956609132, -1668211200, -1647212400, -1636675200, -1613430000, -968025600, -949615200, -942008400, -920239200, -909957600, -888789600, -877903200, -857944800, -846453600, -826495200, -815004000, -795045600, -783554400, -762991200, -752104800, -731541600, -717631200, -700092000, -686181600, -668642400, -654732000, -636588000, -623282400, -605743200, -591832800, -573688800, -559778400, -542239200, -528328800, -510789600, -496879200, -479340000, -465429600, -447890400, -433980000, -415836000, -401925600, -384386400, -370476000, -352936800, -339026400, -321487200, -307576800, -290037600, -276127200, -258588000, -244677600, -226533600, -212623200, -195084000, -181173600, -163634400, -149724000, -132184800, -118274400, -100735200, -86824800, -68680800, -54770400 }
- typeOffsets:intvector { -5244, 0, -5268, 0, -3600, 0, -3600, 3600, 0, 0 }
- typeMap:bin { "0102030203020302030203020302030203020302030203020302030203020302030203020302030203020302030203020302030203020302030203020302030204" }
+ trans:intvector { -1956609120, -1668211200, -1647212400, -1636675200, -1613430000, -1605139200, -1581894000, -1539561600, -1531350000, -968025600, -952293600, -942008400, -920239200, -909957600, -888789600, -877903200, -857944800, -846453600, -826495200, -815004000, -795045600, -783554400, -762991200, -752104800, -731541600, -717631200, -700092000, -686181600, -668642400, -654732000, -636588000, -623282400, -605743200, -591832800, -573688800, -559778400, -542239200, -528328800, -510789600, -496879200, -479340000, -465429600, -447890400, -433980000, -415836000, -401925600, -384386400, -370476000, -352936800, -339026400, -321487200, -307576800, -290037600, -276127200, -258588000, -244677600, -226533600, -212623200, -195084000, -181173600, -163634400, -149724000, -132184800, -118274400, -100735200, -86824800, -68680800, -54770400 }
+ typeOffsets:intvector { -5280, 0, -3600, 0, -3600, 3600, 0, 0 }
+ typeMap:bin { "0102010201020102010201020102010201020102010201020102010201020102010201020102010201020102010201020102010201020102010201020102010201020103" }
links:intvector { 337, 502 }
} //Z#337
/* Atlantic/South_Georgia */ :table {
} //Z#543
/* Pacific/Easter */ :table {
transPre32:intvector { -1, 1770471960 }
- trans:intvector { -1178124152, -870552000, -865278000, -736376400, -36619200, -23922000, -3355200, 7527600, 24465600, 37767600, 55915200, 69217200, 87969600, 100666800, 118209600, 132116400, 150868800, 163566000, 182318400, 195620400, 213768000, 227070000, 245217600, 258519600, 277272000, 289969200, 308721600, 321418800, 340171200, 353473200, 371620800, 384922800, 403070400, 416372400, 434520000, 447822000, 466574400, 479271600, 498024000, 510721200, 529473600, 545194800, 560923200, 574225200, 591768000, 605674800, 624427200, 637729200, 653457600, 668574000, 687326400, 700628400, 718776000, 732078000, 750225600, 763527600, 781675200, 794977200, 813729600, 826426800, 845179200, 859690800, 876628800, 889930800, 906868800, 923194800, 939528000, 952830000, 971582400, 984279600, 1003032000, 1015729200, 1034481600, 1047178800, 1065931200, 1079233200, 1097380800, 1110682800, 1128830400, 1142132400, 1160884800, 1173582000, 1192334400, 1206846000, 1223784000, 1237086000, 1255233600, 1270350000, 1286683200, 1304823600, 1313899200, 1335668400, 1346558400 }
- typeOffsets:intvector { -26264, 0, -26248, 0, -25200, 0, -25200, 3600, -21600, 0, -21600, 3600 }
- typeMap:bin { "01030203020302030203020302030203020302030203020302030203020302030405040504050405040504050405040504050405040504050405040504050405040504050405040504050405040504050405040504050405040504050405" }
- finalRule { "Chile" }
- finalRaw:int { -21600 }
- finalYear:int { 2013 }
+ trans:intvector { -1178124152, -870552000, -865278000, -736376400, -36619200, -23922000, -3355200, 7527600, 24465600, 37767600, 55915200, 69217200, 87969600, 100666800, 118209600, 132116400, 150868800, 163566000, 182318400, 195620400, 213768000, 227070000, 245217600, 258519600, 277272000, 289969200, 308721600, 321418800, 340171200, 353473200, 371620800, 384836400, 384922800, 403070400, 416372400, 434520000, 447822000, 466574400, 479271600, 498024000, 510721200, 529473600, 545194800, 560923200, 574225200, 591768000, 605674800, 624427200, 637729200, 653457600, 668574000, 687326400, 700628400, 718776000, 732078000, 750225600, 763527600, 781675200, 794977200, 813729600, 826426800, 845179200, 859690800, 876628800, 889930800, 906868800, 923194800, 939528000, 952830000, 971582400, 984279600, 1003032000, 1015729200, 1034481600, 1047178800, 1065931200, 1079233200, 1097380800, 1110682800, 1128830400, 1142132400, 1160884800, 1173582000, 1192334400, 1206846000, 1223784000, 1237086000, 1255233600, 1270350000, 1286683200, 1304823600, 1313899200, 1335668400, 1346558400, 1367118000, 1378612800, 1398567600, 1410062400, 1430017200 }
+ typeOffsets:intvector { -26264, 0, -26248, 0, -25200, 0, -25200, 3600, -21600, 0, -21600, 3600, -18000, 0 }
+ typeMap:bin { "01030203020302030203020302030203020302030203020302030203020302030504050405040504050405040504050405040504050405040504050405040504050405040504050405040504050405040504050405040504050405040504050405040506" }
links:intvector { 386, 544 }
} //Z#544
/* Pacific/Efate */ :table {
Chatham:intvector {
8, -30, -1, 9900, 1, 3, 1, -1, 9900, 1, 3600
} //_#8
- Chile:intvector {
- 8, 2, -1, 14400, 2, 3, 23, -1, 10800, 2, 3600
- } //_#9
- ChileAQ:intvector {
- 8, 2, -1, 14400, 2, 3, 23, -1, 10800, 2, 3600
- } //_#10
Cuba:intvector {
2, 8, -1, 0, 1, 10, 1, -1, 0, 1, 3600
- } //_#11
+ } //_#9
EU:intvector {
2, -31, -1, 3600, 2, 9, -31, -1, 3600, 2, 3600
- } //_#12
+ } //_#10
EUAsia:intvector {
2, -31, -1, 3600, 2, 9, -31, -1, 3600, 2, 3600
- } //_#13
+ } //_#11
Egypt:intvector {
3, -30, -6, 0, 1, 8, -30, -5, 86400, 0, 3600
- } //_#14
+ } //_#12
Fiji:intvector {
10, 1, -1, 7200, 0, 0, 18, -1, 10800, 0, 3600
- } //_#15
+ } //_#13
Haiti:intvector {
2, 8, -1, 7200, 0, 10, 1, -1, 7200, 0, 3600
- } //_#16
+ } //_#14
Jordan:intvector {
2, -31, -5, 86400, 0, 9, -31, -6, 0, 1, 3600
- } //_#17
+ } //_#15
LH:intvector {
9, 1, -1, 7200, 0, 3, 1, -1, 7200, 0, 1800
- } //_#18
+ } //_#16
Lebanon:intvector {
2, -31, -1, 0, 0, 9, -31, -1, 0, 0, 3600
- } //_#19
+ } //_#17
Mexico:intvector {
3, 1, -1, 7200, 0, 9, -31, -1, 7200, 0, 3600
- } //_#20
+ } //_#18
Morocco:intvector {
2, -31, -1, 7200, 0, 9, -31, -1, 10800, 0, 3600
- } //_#21
+ } //_#19
NZ:intvector {
8, -30, -1, 7200, 1, 3, 1, -1, 7200, 1, 3600
- } //_#22
+ } //_#20
Namibia:intvector {
8, 1, -1, 7200, 0, 3, 1, -1, 7200, 0, 3600
- } //_#23
+ } //_#21
Palestine:intvector {
2, -31, -5, 86400, 0, 8, 21, -6, 0, 0, 3600
- } //_#24
+ } //_#22
Para:intvector {
9, 1, -1, 0, 0, 2, 22, -1, 0, 0, 3600
- } //_#25
+ } //_#23
Syria:intvector {
2, -31, -6, 0, 0, 9, -31, -6, 0, 0, 3600
- } //_#26
+ } //_#24
SystemV:intvector {
3, -30, -1, 7200, 0, 9, -31, -1, 7200, 0, 3600
- } //_#27
+ } //_#25
Thule:intvector {
2, 8, -1, 7200, 0, 10, 1, -1, 7200, 0, 3600
- } //_#28
+ } //_#26
Troll:intvector {
2, -31, -1, 3600, 2, 9, -31, -1, 3600, 2, 7200
- } //_#29
+ } //_#27
US:intvector {
2, 8, -1, 7200, 0, 10, 1, -1, 7200, 0, 3600
- } //_#30
+ } //_#28
Uruguay:intvector {
9, 1, -1, 7200, 0, 2, 8, -1, 7200, 0, 3600
- } //_#31
+ } //_#29
WS:intvector {
8, -30, -1, 10800, 0, 3, 1, -1, 14400, 0, 3600
- } //_#32
+ } //_#30
Zion:intvector {
2, 23, -6, 7200, 0, 9, -31, -1, 7200, 0, 3600
- } //_#33
+ } //_#31
}
Regions:array {
"AU", //Z#0 ACT
---- base_unidata/DerivedCoreProperties.txt 2014-03-09 16:20:00.000000000 -0700
-+++ DerivedCoreProperties.txt 2014-03-10 06:35:56.000000000 -0700
+--- base_unidata/DerivedCoreProperties.txt 2014-03-16 23:30:07.000000000 -0700
++++ DerivedCoreProperties.txt 2014-10-31 21:56:40.000000000 -0700
@@ -162,6 +162,9 @@
29FE..2AFF ; Math
2B30..2B44 ; Math
F900..FA6D ; Grapheme_Base
FA70..FAD9 ; Grapheme_Base
FB00..FB06 ; Grapheme_Base
+@@ -9979,7 +9998,7 @@
+ 1F3A0..1F3C4 ; Grapheme_Base
+ 1F3C6..1F3CA ; Grapheme_Base
+ 1F3E0..1F3F0 ; Grapheme_Base
+-1F400..1F43E ; Grapheme_Base
++1F3FB..1F43E ; Grapheme_Base
+ 1F440 ; Grapheme_Base
+ 1F442..1F4F7 ; Grapheme_Base
+ 1F4F9..1F4FC ; Grapheme_Base
@@ -9995,7 +10014,7 @@
2B740..2B81D ; Grapheme_Base
2F800..2FA1D ; Grapheme_Base
# ================================================
---- base_unidata/ppucd.txt 2014-03-09 16:20:00.000000000 -0700
-+++ ppucd.txt 2014-03-10 06:34:27.000000000 -0700
+--- base_unidata/ppucd.txt 2014-03-16 23:30:07.000000000 -0700
++++ ppucd.txt 2014-10-31 22:12:22.000000000 -0700
@@ -17833,8 +17833,47 @@
block;DC00..DFFF;age=2.0;blk=Low_Surrogates;gc=Cs;GCB=CN;lb=SG
# DC00..DFFF Low Surrogates
block;F900..FAFF;age=1.1;Alpha;blk=CJK_Compat_Ideographs;Comp_Ex;CWKCF;dt=Can;ea=W;gc=Lo;Gr_Base;IDC;Ideo;IDS;lb=ID;NFC_QC=N;NFD_QC=N;NFKC_QC=N;NFKD_QC=N;SB=LE;sc=Hani;XIDC;XIDS
# F900..FAFF CJK Compatibility Ideographs
---- base_unidata/UnicodeData.txt 2014-03-09 16:20:00.000000000 -0700
-+++ UnicodeData.txt 2014-03-09 16:20:01.000000000 -0700
+@@ -27053,7 +27092,13 @@
+ cp;1F3EE;na=IZAKAYA LANTERN
+ cp;1F3EF;na=JAPANESE CASTLE
+ cp;1F3F0;na=EUROPEAN CASTLE
+-cp;1F3F1..1F3FF;age=NA;bc=L;gc=Cn;-Gr_Base;lb=XX;sc=Zzzz
++cp;1F3F1..1F3FA;age=NA;bc=L;gc=Cn;-Gr_Base;lb=XX;sc=Zzzz
++# Early add of emoji modifiers for Fitzpatrick types, slated for Unicode 8.0
++cp;1F3FB;age=8.0;gc=Sk;na=EMOJI MODIFIER FITZPATRICK TYPE-1-2
++cp;1F3FC;age=8.0;gc=Sk;na=EMOJI MODIFIER FITZPATRICK TYPE-3
++cp;1F3FD;age=8.0;gc=Sk;na=EMOJI MODIFIER FITZPATRICK TYPE-4
++cp;1F3FE;age=8.0;gc=Sk;na=EMOJI MODIFIER FITZPATRICK TYPE-5
++cp;1F3FF;age=8.0;gc=Sk;na=EMOJI MODIFIER FITZPATRICK TYPE-6
+ # Animal symbols
+ cp;1F400;na=RAT
+ cp;1F401;na=MOUSE
+--- base_unidata/UnicodeData.txt 2014-03-16 23:30:07.000000000 -0700
++++ UnicodeData.txt 2014-10-31 21:51:42.000000000 -0700
@@ -14443,7 +14443,65 @@
DC00;<Low Surrogate, First>;Cs;0;L;;;;;N;;;;;
DFFF;<Low Surrogate, Last>;Cs;0;L;;;;;N;;;;;
F900;CJK COMPATIBILITY IDEOGRAPH-F900;Lo;0;L;8C48;;;;N;;;;;
F901;CJK COMPATIBILITY IDEOGRAPH-F901;Lo;0;L;66F4;;;;N;;;;;
F902;CJK COMPATIBILITY IDEOGRAPH-F902;Lo;0;L;8ECA;;;;N;;;;;
---- norm2/base_norm2/nfc.txt 2014-03-08 23:07:17.000000000 -0800
-+++ norm2/nfc.txt 2014-03-09 16:14:50.000000000 -0700
+@@ -22937,6 +22995,11 @@
+ 1F3EE;IZAKAYA LANTERN;So;0;ON;;;;;N;;;;;
+ 1F3EF;JAPANESE CASTLE;So;0;ON;;;;;N;;;;;
+ 1F3F0;EUROPEAN CASTLE;So;0;ON;;;;;N;;;;;
++1F3FB;EMOJI MODIFIER FITZPATRICK TYPE-1-2;Sk;0;ON;;;;;N;;;;;
++1F3FC;EMOJI MODIFIER FITZPATRICK TYPE-3;Sk;0;ON;;;;;N;;;;;
++1F3FD;EMOJI MODIFIER FITZPATRICK TYPE-4;Sk;0;ON;;;;;N;;;;;
++1F3FE;EMOJI MODIFIER FITZPATRICK TYPE-5;Sk;0;ON;;;;;N;;;;;
++1F3FF;EMOJI MODIFIER FITZPATRICK TYPE-6;Sk;0;ON;;;;;N;;;;;
+ 1F400;RAT;So;0;ON;;;;;N;;;;;
+ 1F401;MOUSE;So;0;ON;;;;;N;;;;;
+ 1F402;OX;So;0;ON;;;;;N;;;;;
+--- norm2/base_norm2/nfc.txt 2014-03-16 23:30:07.000000000 -0700
++++ norm2/nfc.txt 2014-03-16 23:30:07.000000000 -0700
@@ -272,6 +272,8 @@
AAC1:230
AAF6:9
1F3A0..1F3C4 ; Grapheme_Base
1F3C6..1F3CA ; Grapheme_Base
1F3E0..1F3F0 ; Grapheme_Base
-1F400..1F43E ; Grapheme_Base
+1F3FB..1F43E ; Grapheme_Base
1F440 ; Grapheme_Base
1F442..1F4F7 ; Grapheme_Base
1F4F9..1F4FC ; Grapheme_Base
1F3EE;IZAKAYA LANTERN;So;0;ON;;;;;N;;;;;
1F3EF;JAPANESE CASTLE;So;0;ON;;;;;N;;;;;
1F3F0;EUROPEAN CASTLE;So;0;ON;;;;;N;;;;;
+1F3FB;EMOJI MODIFIER FITZPATRICK TYPE-1-2;Sk;0;ON;;;;;N;;;;;
+1F3FC;EMOJI MODIFIER FITZPATRICK TYPE-3;Sk;0;ON;;;;;N;;;;;
+1F3FD;EMOJI MODIFIER FITZPATRICK TYPE-4;Sk;0;ON;;;;;N;;;;;
+1F3FE;EMOJI MODIFIER FITZPATRICK TYPE-5;Sk;0;ON;;;;;N;;;;;
+1F3FF;EMOJI MODIFIER FITZPATRICK TYPE-6;Sk;0;ON;;;;;N;;;;;
1F400;RAT;So;0;ON;;;;;N;;;;;
1F401;MOUSE;So;0;ON;;;;;N;;;;;
1F402;OX;So;0;ON;;;;;N;;;;;
cp;1F3EE;na=IZAKAYA LANTERN
cp;1F3EF;na=JAPANESE CASTLE
cp;1F3F0;na=EUROPEAN CASTLE
-cp;1F3F1..1F3FF;age=NA;bc=L;gc=Cn;-Gr_Base;lb=XX;sc=Zzzz
+cp;1F3F1..1F3FA;age=NA;bc=L;gc=Cn;-Gr_Base;lb=XX;sc=Zzzz
+# Early add of emoji modifiers for Fitzpatrick types, slated for Unicode 8.0
+cp;1F3FB;age=8.0;gc=Sk;na=EMOJI MODIFIER FITZPATRICK TYPE-1-2
+cp;1F3FC;age=8.0;gc=Sk;na=EMOJI MODIFIER FITZPATRICK TYPE-3
+cp;1F3FD;age=8.0;gc=Sk;na=EMOJI MODIFIER FITZPATRICK TYPE-4
+cp;1F3FE;age=8.0;gc=Sk;na=EMOJI MODIFIER FITZPATRICK TYPE-5
+cp;1F3FF;age=8.0;gc=Sk;na=EMOJI MODIFIER FITZPATRICK TYPE-6
# Animal symbols
cp;1F400;na=RAT
cp;1F401;na=MOUSE
int32_t
DecimalFormat::getGroupingSize() const
{
- return fGroupingSize;
+ return isGroupingUsed() ? fGroupingSize : 0;
}
//------------------------------------------------------------------------------
}
//----------------------------------------------------------------------
+// Nonzero when c is one of the bidi marks: U+200E LEFT-TO-RIGHT MARK,
+// U+200F RIGHT-TO-LEFT MARK, or U+061C ARABIC LETTER MARK.
+// The argument is parenthesized at each use so that expressions containing
+// lower-precedence operators expand correctly; it may still be evaluated up
+// to three times, so pass only side-effect-free expressions.
+#define IS_BIDI_MARK(c) ((c)==0x200E || (c)==0x200F || (c)==0x061C)
+
UBool SimpleDateFormat::matchLiterals(const UnicodeString &pattern,
int32_t &patternOffset,
const UnicodeString &text,
}
}
- literal += ch;
+ if (!IS_BIDI_MARK(ch)) {
+ literal += ch;
+ }
}
- // at this point, literal contains the literal text
+ // at this point, literal contains the pattern literal text (without bidi marks)
// and i is the index of the next non-literal pattern character.
int32_t p;
int32_t t = textOffset;
if (whitespaceLenient) {
- // trim leading, trailing whitespace from
- // the literal text
+ // trim leading, trailing whitespace from the pattern literal
literal.trim();
- // ignore any leading whitespace in the text
- while (t < text.length() && u_isWhitespace(text.charAt(t))) {
+ // ignore any leading whitespace (or bidi marks) in the text
+ while (t < text.length()) {
+ UChar ch = text.charAt(t);
+ if (!u_isWhitespace(ch) && !IS_BIDI_MARK(ch)) {
+ break;
+ }
t += 1;
}
}
for (p = 0; p < literal.length() && t < text.length();) {
UBool needWhitespace = FALSE;
+ // Skip any whitespace at current position in pattern,
+ // but remember whether we found whitespace in the pattern
+ // (we already deleted any bidi marks in the pattern).
while (p < literal.length() && PatternProps::isWhiteSpace(literal.charAt(p))) {
needWhitespace = TRUE;
p += 1;
}
+ // If the pattern has whitespace at this point, skip it in text as well
+ // (if the text does not have any, that may be an error for strict parsing)
if (needWhitespace) {
- int32_t tStart = t;
+ UBool whitespaceInText = FALSE;
+ // Skip any whitespace (or bidi marks) at current position in text,
+ // but remember whether we found whitespace in the text at this point.
while (t < text.length()) {
UChar tch = text.charAt(t);
-
- if (!u_isUWhiteSpace(tch) && !PatternProps::isWhiteSpace(tch)) {
+ if (u_isUWhiteSpace(tch) || PatternProps::isWhiteSpace(tch)) {
+ whitespaceInText = TRUE;
+ } else if (!IS_BIDI_MARK(tch)) {
break;
}
// TODO: should we require internal spaces
// in lenient mode? (There won't be any
// leading or trailing spaces)
- if (!whitespaceLenient && t == tStart) {
+ if (!whitespaceLenient && !whitespaceInText) {
// didn't find matching whitespace:
// an error in strict mode
return FALSE;
if (p >= literal.length()) {
break;
}
+ } else {
+ // Still need to skip any bidi marks in the text
+ while (t < text.length() && IS_BIDI_MARK(text.charAt(t))) {
+ ++t;
+ }
}
if (t >= text.length() || literal.charAt(p) != text.charAt(t)) {
for (t = textOffset; t < text.length(); t += 1) {
UChar ch = text.charAt(t);
- if (ignorables == NULL || !ignorables->contains(ch)) {
+ if (!IS_BIDI_MARK(ch) && (ignorables == NULL || !ignorables->contains(ch))) {
break;
}
}
static void TestUnicodeDefines(void);
+static void TestGetAppleParent(void);
+
void PrintDataTable();
/*---------------------------------------------------
TESTCASE(TestUnicodeDefines);
TESTCASE(TestEnglishExemplarCharacters);
TESTCASE(TestDisplayNameBrackets);
+ TESTCASE(TestGetAppleParent);
}
TEST_UNICODE_DEFINE(ULOC_KEYWORD_ASSIGN, ULOC_KEYWORD_ASSIGN_UNICODE);
TEST_UNICODE_DEFINE(ULOC_KEYWORD_ITEM_SEPARATOR, ULOC_KEYWORD_ITEM_SEPARATOR_UNICODE);
}
+
+/* Apple-specific: test data for ualoc_getAppleParent, consumed by TestGetAppleParent.
+ * Entries are consecutive pairs of strings: an input locale ID followed by the
+ * parent locale ID the test expects to get back for it. A NULL in the input
+ * position ends the table. */
+static const char* localesAndAppleParent[] = {
+ "en", "root",
+ "en-US", "en",
+ "en-CA", "en",
+ "en-001", "en",
+ "en_001", "en",
+ "en-GB", "en_001",
+ "en_GB", "en_001",
+ "en-IN", "en_GB",
+ "en-AU", "en_GB",
+ "es", "root",
+ "es-ES", "es",
+ "es-419", "es",
+ "es_419", "es",
+ "es-MX", "es_419",
+ "es-AR", "es_419",
+ "fr", "root",
+ "fr-CA", "fr",
+ "fr-CH", "fr",
+ "haw", "root",
+ "nl", "root",
+ "nl-BE", "nl",
+ "pt", "root",
+ "pt-BR", "pt",
+ "pt-PT", "pt",
+ "pt-MO", "pt_PT",
+ "sr", "root",
+ "sr-Cyrl", "sr",
+ "sr-Latn", "root",
+ "tlh", "root",
+ "zh_CN", "root",
+ "zh-CN", "root",
+ "zh", "zh_CN",
+ "zh-Hans", "zh",
+ "zh_TW", "root",
+ "zh-TW", "root",
+ "zh-Hant", "zh_TW",
+ "zh_Hant", "zh_TW",
+ "zh-Hant-HK", "zh_Hant",
+ "zh_Hant_HK", "zh_Hant",
+ "zh-Hant-MO", "zh_Hant_HK",
+ "zh-Hans-HK", "zh_Hans",
+ "root", "root",
+ "en-Latn", "en",
+ "en-Latn-US", "en_Latn",
+ "en_US_POSIX", "en_US",
+ "en_Latn_US_POSIX", "en_Latn_US",
+ "en-u-ca-hebrew", "root",
+ "en@calendar=hebrew", "root",
+ "en_@calendar=hebrew", "root",
+ "en-", "root",
+ "en_", "root",
+ "Default@2x", "root",
+ "default", "root",
+ NULL /* terminator */
+};
+
+static void TestGetAppleParent() {
+    /*
+     * Walk localesAndAppleParent two strings at a time: entry[0] is the locale
+     * handed to ualoc_getAppleParent, entry[1] is the parent we expect back.
+     * A NULL in the entry[0] position marks the end of the table.
+     */
+    const char **entry;
+    for (entry = localesAndAppleParent; entry[0] != NULL; entry += 2) {
+        const char *inputLocale = entry[0];
+        const char *wantParent = entry[1];
+        char gotParent[ULOC_FULLNAME_CAPACITY];
+        UErrorCode ec = U_ZERO_ERROR;
+        int32_t plen = ualoc_getAppleParent(inputLocale, gotParent, ULOC_FULLNAME_CAPACITY, &ec);
+
+        /* A failing status is reported on its own; the output buffer is not compared in that case. */
+        if (U_FAILURE(ec)) {
+            log_err("FAIL: ualoc_getAppleParent input \"%s\", status %s\n", inputLocale, u_errorName(ec));
+            continue;
+        }
+        if (uprv_strcmp(wantParent, gotParent) != 0) {
+            log_err("FAIL: ualoc_getAppleParent input \"%s\", expected parent \"%s\", got parent \"%s\"\n", inputLocale, wantParent, gotParent);
+        }
+    }
+}
return;
}
+ // bug 10864
+ status = U_ZERO_ERROR;
+ DecimalFormat noGrouping("###0.##", status);
+ if (noGrouping.getGroupingSize() != 0) {
+ errln("Grouping size should be 0 for no grouping.");
+ }
+ // end bug 10864
+
status = U_ZERO_ERROR;
const UnicodeString pattern("#,##0.# FF");
DecimalFormat pat(pattern, status);
ExpectedResult abbrevData[] = {
{t_1m_59_9996s, LENGTHOF(t_1m_59_9996s), "1 min, 59.9996 secs"},
- {t_19m, LENGTHOF(t_19m), "19 mins"},
+ {t_19m, LENGTHOF(t_19m), "19 min"},
{t_1h_23_5s, LENGTHOF(t_1h_23_5s), "1 hr, 23.5 secs"},
- {t_1h_23_5m, LENGTHOF(t_1h_23_5m), "1 hr, 23.5 mins"},
- {t_1h_0m_23s, LENGTHOF(t_1h_0m_23s), "1 hr, 0 mins, 23 secs"},
+ {t_1h_23_5m, LENGTHOF(t_1h_23_5m), "1 hr, 23.5 min"},
+ {t_1h_0m_23s, LENGTHOF(t_1h_0m_23s), "1 hr, 0 min, 23 secs"},
{t_2y_5M_3w_4d, LENGTHOF(t_2y_5M_3w_4d), "2 yrs, 5 mths, 3 wks, 4 days"}};
ExpectedResult narrowData[] = {
tp.srcLine = new UVector32(status);
tp.srcCol = new UVector32(status);
- RegexMatcher localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_]*) *>"), 0, status);
+ RegexMatcher localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_@=-]*) *>"), 0, status);
if (U_FAILURE(status)) {
dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
}
/***********************************************************************
* COPYRIGHT:
- * Copyright (c) 1997-2014, International Business Machines Corporation
+ * Copyright (c) 1997-2015, International Business Machines Corporation
* and others. All Rights Reserved.
***********************************************************************/
{"America/Virgin", "America/Anguilla"},
{"Antarctica/South_Pole", "Antarctica/McMurdo"},
{"Arctic/Longyearbyen", "Europe/Oslo"},
+ {"Asia/Kuwait", "Asia/Aden"},
+ {"Asia/Muscat", "Asia/Dubai"},
{"Asia/Phnom_Penh", "Asia/Bangkok"},
+ {"Asia/Qatar", "Asia/Bahrain"},
+ {"Asia/Riyadh", "Asia/Aden"},
{"Asia/Vientiane", "Asia/Bangkok"},
{"Atlantic/Jan_Mayen", "Europe/Oslo"},
{"Atlantic/St_Helena", "Africa/Abidjan"},
-# Copyright (c) 2001-2014 International Business Machines
+# Copyright (c) 2001-2015 International Business Machines
# Corporation and others. All Rights Reserved.
#
# RBBI Test Data
<data>•abc/\u05D9 •def•</data>
<data>•\u05E7\u05D7/\u05D9 •\u05DE\u05E2\u05D9\u05DC•</data>
<data>•\u05D3\u05E8\u05D5\u05E9\u05D9\u05DD •\u05E9\u05D7\u05E7\u05E0\u05D9\u05DD/\u05D9\u05D5\u05EA•</data>
+
+####################################################################################
+#
+# Test CSS line break variants: strict, normal, loose
+#
+####################################################################################
+
+<locale ja@lb=strict>
+<line>
+# •no brk before 3063 •no brk before 301C•no brk btw 2026 •no brk before FF01•
+<data>•\u3084\u3063•\u3071•\u308A\u0020•\u0031\u301C\u0020•\u2026\u2026\u0020•\u30A2\uFF01\u0020•</data>
+
+<locale ja@lb=normal>
+<line>
+# •brk OK before 3063 •brk OK before 301C •no brk btw 2026 •no brk before FF01•
+<data>•\u3084•\u3063•\u3071•\u308A\u0020•\u0031•\u301C\u0020•\u2026\u2026\u0020•\u30A2\uFF01\u0020•</data>
+
+<locale ja@lb=loose>
+<line>
+# •brk OK before 3063 •brk OK before 301C •brk OK btw 2026 •brk OK before FF01•
+<data>•\u3084•\u3063•\u3071•\u308A\u0020•\u0031•\u301C\u0020•\u2026•\u2026\u0020•\u30A2•\uFF01\u0020•</data>
+
+<locale en@lb=strict>
+<line>
+# •no brk before 3063 •no brk before 301C•no brk btw 2026 •no brk before FF01•
+<data>•\u3084\u3063•\u3071•\u308A\u0020•\u0031\u301C\u0020•\u2026\u2026\u0020•\u30A2\uFF01\u0020•</data>
+
+<locale en@lb=normal>
+<line>
+# •brk OK before 3063 •no brk before 301C •no brk btw 2026 •no brk before FF01•
+<data>•\u3084•\u3063•\u3071•\u308A\u0020•\u0031\u301C\u0020•\u2026\u2026\u0020•\u30A2\uFF01\u0020•</data>
+
+<locale en@lb=loose>
+<line>
+# •brk OK before 3063 •no brk before 301C •brk OK btw 2026 •no brk before FF01•
+<data>•\u3084•\u3063•\u3071•\u308A\u0020•\u0031\u301C\u0020•\u2026•\u2026\u0020•\u30A2\uFF01\u0020•</data>
+
+####################################################################################
+#
+# Test Apple breaks for emoji clusters (same for all locales and break types)
+#
+####################################################################################
+
+<locale root>
+
+<char>
+# woman zwj woman zwj girl zwj girl, woman/fitz-1-2 zwj woman/fitz-4 zwj boy/fitz-6
+<data>•\U0001F469\u200D\U0001F469\u200D\U0001F467\u200D\U0001F467•\U0001F469\U0001F3FB\u200D\U0001F469\U0001F3FD\u200D\U0001F466\U0001F3FF•</data>
+# woman zwj, baby/fitz-3, older_woman/fitz-5, runner/fitz-4, raised_fist/fitz-3, fuel_pump, fitz-3
+<data>•\U0001F469\u200D•\U0001F476\U0001F3FC•\U0001F475\U0001F3FE•\U0001F3C3\U0001F3FD•\u270A\U0001F3FC•\u26FD•\U0001F3FC•</data>
+# man zwj hvy_blk_heart zwj man, woman, man zwj hvy_blk_heart esel zwj man, woman
+<data>•\U0001F468\u200D\u2764\u200D\U0001F468•\U0001F469•\U0001F468\u200D\u2764\uFE0F\u200D\U0001F468•\U0001F469•</data>
+# woman zwj hvy_blk_heart/esel zwj kiss_mark zwj woman, man
+<data>•\U0001F469\u200D\u2764\uFE0F\u200D\U0001F48B\u200D\U0001F469•\U0001F468•</data>
+# victory_hand esel, victory_hand/esel/fitz-1-2, victory_hand/fitz-1-2, rowboat/fitz-4, vulcan_salute/fitz-5, space,
+<data>•\u270C\uFE0F•\u270C\uFE0F\U0001F3FB•\u270C\U0001F3FB•\U0001F6A3\U0001F3FD•\U0001F596\U0001F3FE•\u0020•</data>
+# flags1 AE AU AT BE BR CA CL
+<data>•\U0001F1E6\U0001F1EA•\U0001F1E6\U0001F1FA•\U0001F1E6\U0001F1F9•\U0001F1E7\U0001F1EA•\U0001F1E7\U0001F1F7•\U0001F1E8\U0001F1E6•\U0001F1E8\U0001F1F1•</data>
+# flags2 CN CO DK FI FR DE HK
+<data>•\U0001F1E8\U0001F1F3•\U0001F1E8\U0001F1F4•\U0001F1E9\U0001F1F0•\U0001F1EB\U0001F1EE•\U0001F1EB\U0001F1F7•\U0001F1E9\U0001F1EA•\U0001F1ED\U0001F1F0•</data>
+# flags3 IN ID IE IL IT JP KR
+<data>•\U0001F1EE\U0001F1F3•\U0001F1EE\U0001F1E9•\U0001F1EE\U0001F1EA•\U0001F1EE\U0001F1F1•\U0001F1EE\U0001F1F9•\U0001F1EF\U0001F1F5•\U0001F1F0\U0001F1F7•</data>
+
+<word>
+# woman zwj woman zwj girl zwj girl, woman/fitz-1-2 zwj woman/fitz-4 zwj boy/fitz-6
+<data>•\U0001F469\u200D\U0001F469\u200D\U0001F467\u200D\U0001F467•\U0001F469\U0001F3FB\u200D\U0001F469\U0001F3FD\u200D\U0001F466\U0001F3FF•</data>
+# woman zwj, baby/fitz-3, older_woman/fitz-5, runner/fitz-4, raised_fist/fitz-3, fuel_pump, fitz-3
+<data>•\U0001F469\u200D•\U0001F476\U0001F3FC•\U0001F475\U0001F3FE•\U0001F3C3\U0001F3FD•\u270A\U0001F3FC•\u26FD•\U0001F3FC•</data>
+# man zwj hvy_blk_heart zwj man, woman, man zwj hvy_blk_heart esel zwj man, woman
+<data>•\U0001F468\u200D\u2764\u200D\U0001F468•\U0001F469•\U0001F468\u200D\u2764\uFE0F\u200D\U0001F468•\U0001F469•</data>
+# woman zwj hvy_blk_heart esel zwj kiss mark zwj woman, man
+<data>•\U0001F469\u200D\u2764\uFE0F\u200D\U0001F48B\u200D\U0001F469•\U0001F468•</data>
+# victory_hand esel, victory_hand/esel/fitz-1-2, victory_hand/fitz-1-2, rowboat/fitz-4, vulcan_salute/fitz-5, space,
+<data>•\u270C\uFE0F•\u270C\uFE0F\U0001F3FB•\u270C\U0001F3FB•\U0001F6A3\U0001F3FD•\U0001F596\U0001F3FE•\u0020•</data>
+# flags1 AE AU AT BE BR CA CL
+<data>•\U0001F1E6\U0001F1EA•\U0001F1E6\U0001F1FA•\U0001F1E6\U0001F1F9•\U0001F1E7\U0001F1EA•\U0001F1E7\U0001F1F7•\U0001F1E8\U0001F1E6•\U0001F1E8\U0001F1F1•</data>
+# flags2 CN CO DK FI FR DE HK
+<data>•\U0001F1E8\U0001F1F3•\U0001F1E8\U0001F1F4•\U0001F1E9\U0001F1F0•\U0001F1EB\U0001F1EE•\U0001F1EB\U0001F1F7•\U0001F1E9\U0001F1EA•\U0001F1ED\U0001F1F0•</data>
+# flags3 IN ID IE IL IT JP KR
+<data>•\U0001F1EE\U0001F1F3•\U0001F1EE\U0001F1E9•\U0001F1EE\U0001F1EA•\U0001F1EE\U0001F1F1•\U0001F1EE\U0001F1F9•\U0001F1EF\U0001F1F5•\U0001F1F0\U0001F1F7•</data>
+
+<line>
+# woman zwj woman zwj girl zwj girl # (line, skip this for now, need safe rules and we don't generate it:) woman/fitz-1-2 zwj woman/fitz-4 zwj boy/fitz-6
+<data>•\U0001F469\u200D\U0001F469\u200D\U0001F467\u200D\U0001F467•</data>
+# woman zwj, baby/fitz-3, older_woman/fitz-5, runner/fitz-4, raised_fist/fitz-3, fuel_pump, fitz-3
+<data>•\U0001F469\u200D•\U0001F476\U0001F3FC•\U0001F475\U0001F3FE•\U0001F3C3\U0001F3FD•\u270A\U0001F3FC•\u26FD•\U0001F3FC•</data>
+# man zwj hvy_blk_heart zwj man, woman, man zwj hvy_blk_heart esel zwj man, woman
+<data>•\U0001F468\u200D\u2764\u200D\U0001F468•\U0001F469•\U0001F468\u200D\u2764\uFE0F\u200D\U0001F468•\U0001F469•</data>
+# woman zwj hvy_blk_heart esel zwj kiss mark zwj woman, man
+<data>•\U0001F469\u200D\u2764\uFE0F\u200D\U0001F48B\u200D\U0001F469•\U0001F468•</data>
+# victory_hand esel, victory_hand/esel/fitz-1-2, victory_hand/fitz-1-2, rowboat/fitz-4, vulcan_salute/fitz-5 space,
+<data>•\u270C\uFE0F•\u270C\uFE0F\U0001F3FB•\u270C\U0001F3FB•\U0001F6A3\U0001F3FD•\U0001F596\U0001F3FE\u0020•</data>
+# no special flags handling for line
+
+<locale ja@lb=loose>
+<line>
+# woman zwj woman zwj girl zwj girl # (line, skip this for now, need safe rules and we don't generate it:) woman/fitz-1-2 zwj woman/fitz-4 zwj boy/fitz-6
+<data>•\U0001F469\u200D\U0001F469\u200D\U0001F467\u200D\U0001F467•</data>
+# woman zwj, baby/fitz-3, older_woman/fitz-5, runner/fitz-4, raised_fist/fitz-3, fuel_pump, fitz-3
+<data>•\U0001F469\u200D•\U0001F476\U0001F3FC•\U0001F475\U0001F3FE•\U0001F3C3\U0001F3FD•\u270A\U0001F3FC•\u26FD•\U0001F3FC•</data>
+# man zwj hvy_blk_heart zwj man, woman, man zwj hvy_blk_heart esel zwj man, woman
+<data>•\U0001F468\u200D\u2764\u200D\U0001F468•\U0001F469•\U0001F468\u200D\u2764\uFE0F\u200D\U0001F468•\U0001F469•</data>
+# woman zwj hvy_blk_heart esel zwj kiss mark zwj woman, man
+<data>•\U0001F469\u200D\u2764\uFE0F\u200D\U0001F48B\u200D\U0001F469•\U0001F468•</data>
+# victory_hand esel, victory_hand/esel/fitz-1-2, victory_hand/fitz-1-2, rowboat/fitz-4, vulcan_salute/fitz-5 space,
+<data>•\u270C\uFE0F•\u270C\uFE0F\U0001F3FB•\u270C\U0001F3FB•\U0001F6A3\U0001F3FD•\U0001F596\U0001F3FE\u0020•</data>
+# no special flags handling for line
endif
# even for a crossbuild host build, we want to use the target's latest tzdata as pointed to by latest_tzdata.tar.gz
-export TZDATA:=$(SDKPATH)/usr/local/share/tz/$(shell readlink $(SDKPATH)/usr/local/share/tz/latest_tzdata.tar.gz)
+ifeq "$(shell test -d $(SDKPATH)/usr/local/share/tz && echo YES )" "YES"
+ export TZDATA:=$(SDKPATH)/usr/local/share/tz/$(shell readlink $(SDKPATH)/usr/local/share/tz/latest_tzdata.tar.gz)
+endif
$(info # TZDATA=$(TZDATA))
ifeq "$(WINDOWS)" "YES"