From ee6c04ef0082bcd8ca62240dc8278f22d8372cf1 Mon Sep 17 00:00:00 2001 From: "Jay Freeman (saurik)" Date: Wed, 25 Nov 2015 05:54:12 -0800 Subject: [PATCH] Support most of the Unicode stuff in ECMAScript 6. --- .gitignore | 2 + Cycript.l.in | 132 ++++++++++++++++++++++++++++++++++++++------ Filter.sh | 64 +++++++++++++-------- Library.cpp | 33 +++++++++-- Makefile.am | 6 +- Makefile.in | 6 +- UnicodeIDContinue.l | 10 ++++ UnicodeIDStart.l | 9 +++ backtrack.sh | 3 + unicode.mk | 38 +++++++++++++ unicode.py | 115 ++++++++++++++++++++++++++++++++++++++ unicode.sh | 32 +++++++++++ 12 files changed, 400 insertions(+), 50 deletions(-) create mode 100644 UnicodeIDContinue.l create mode 100644 UnicodeIDStart.l create mode 100755 backtrack.sh create mode 100644 unicode.mk create mode 100755 unicode.py create mode 100755 unicode.sh diff --git a/.gitignore b/.gitignore index dd1e7c5..6d37fe1 100644 --- a/.gitignore +++ b/.gitignore @@ -41,3 +41,5 @@ build.* Cycript.ios Cycript.osx Cycript.lib +DerivedCoreProperties.txt +PropList.txt diff --git a/Cycript.l.in b/Cycript.l.in index 081980e..d54ba0a 100644 --- a/Cycript.l.in +++ b/Cycript.l.in @@ -116,6 +116,46 @@ int H(char c) { return -1; } +static void U(char *&local, unsigned point) { /* append the UTF-8 encoding of code point `point` at local, advancing local past the bytes written; asserts when point >= 0x110000 */ + if (false) { + } else if (point < 0x000080) { + *local++ = point; + } else if (point < 0x000800) { + *local++ = 0xc0 | point >> 0x06 & 0x1f; + goto one; + } else if (point < 0x010000) { + *local++ = 0xe0 | point >> 0x0c & 0x0f; + goto two; + } else if (point < 0x110000) { + *local++ = 0xf0 | point >> 0x12 & 0x07; + *local++ = 0x80 | point >> 0x0c & 0x3f; + two: + *local++ = 0x80 | point >> 0x06 & 0x3f; + one: + *local++ = 0x80 | point >> 0x00 & 0x3f; + } else _assert(false); +} + +static void U(char *&local, const char *text, yy_size_t &i) { /* decode a \uXXXX or \u{X...} escape and append it to local as UTF-8; on entry text[i] is the 'u', on exit text[i] is the last character of the escape */ + unsigned point; + + char next(text[++i]); + if (next != '{') { + point = H(text[i + 0]) << 12 | H(text[i + 1]) << 8 | H(text[i + 2]) << 4 | H(text[i + 3]); + i += 3; + } else { + point = 0; + for (;;) { +
next = text[++i]; + if (next == '}') + break; + point = point < 0x110000 ? (point << 4) | H(next) : point; /* saturate once past U+10FFFF so an over-long escape cannot wrap around to a valid code point and is rejected by the range check in U(local, point) */ + } + } + + U(local, point); +} + #define YY_INPUT(data, value, size) { \ if (yyextra->data_.eof()) \ value = YY_NULL; \ @@ -146,18 +186,45 @@ int H(char c) { %option full %option ecs -%option align -Escape \\[\\'"bfnrtv]|\\[0-7]|\\[4-7][0-7]|\\[0-3][0-7][0-7]?|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\\n +U1 [\x00-\x7f] +U0 [\x80-\xbf] +U2 [\xc2-\xdf] +U3 [\xe0-\xef] +U4 [\xf0-\xf4] -IdentifierStart [a-zA-Z$_] -IdentifierPart [a-zA-Z$_0-9] +HexDigit [0-9a-fA-F] +LineTerminatorSequence \r?\n|\r|\xe2\x80[\xa8\xa9] +WhiteSpace [\x09\x0b\x0c\x20]|\xc2\xa0|\xef\xbb\xbf +UnicodeEscape \\u({HexDigit}{4}|\{{HexDigit}+\}) + +OctalEscape \\[1-7]|\\[4-7][0-7]|\\[0-3][0-7][0-7]? +StringEscape \\['"\\bfnrtv]|\\0|{OctalEscape}|\\x{HexDigit}{2}|{UnicodeEscape} +StringExtra {StringEscape}|\\{LineTerminatorSequence} +SingleString ([^'\\\n]|{StringExtra})* +DoubleString ([^"\\\n]|{StringExtra})* +StringPrefix '{SingleString}|\"{DoubleString} + +@include UnicodeIDStart.l +@include UnicodeIDContinue.l + +IdentifierMore [$_] + +UnicodeStart {IdentifierMore}|{UnicodeIDStart} +UnicodePart {IdentifierMore}|\xe2\x80[\x8c\x8d]|{UnicodeIDContinue} +UnicodeFail {U2}|{U3}|{U3}{U0}|{U4}|{U4}{U0}|{U4}{U0}{U0} +UnicodeScrap {UnicodePart}*{UnicodeFail}? + +IdentifierStart {UnicodeStart}|{UnicodeEscape} +IdentifierPart {UnicodePart}|{UnicodeEscape} +IdentifierFail {UnicodeFail}|\\(u({HexDigit}{0,3}|\{{HexDigit}*))? +IdentifierScrap {IdentifierPart}*{IdentifierFail}?
NonTerminator [^\n] BackslashSequence \\{NonTerminator} RegularExpressionFirstChar [^\n*\\/]|{BackslashSequence} RegularExpressionChar [^\n\\/]|{BackslashSequence} -RegularExpressionFlags {IdentifierPart}* +RegularExpressionFlags {UnicodePart}* RegularExpressionChars {RegularExpressionChar}* RegularExpressionBody {RegularExpressionFirstChar}{RegularExpressionChars} @@ -178,6 +245,7 @@ XMLName {XMLNameStart}{XMLNamePart}* %% \/{RegularExpressionBody}\/{RegularExpressionFlags} L C I(literal, RegEx(Y), tk::RegularExpressionLiteral, hi::Constant); +\/{RegularExpressionBody}\/{RegularExpressionFlags}{UnicodeFail} L E("invalid flags") \/{RegularExpressionBody}?\\? L E("unterminated regex") #![^\n]* L M @@ -401,7 +469,25 @@ XMLName {XMLNameStart}{XMLNamePart}* "xml" L C I(identifier, Identifier("xml"), tk::XML, hi::Meta); @end -{IdentifierStart}{IdentifierPart}* L C I(identifier, Identifier(Y), tk::Identifier_, hi::Identifier); +{UnicodeStart}{UnicodePart}* L C I(identifier, Identifier(Y), tk::Identifier_, hi::Identifier); + +{IdentifierStart}{IdentifierPart}* L C { + char *value(A char[yyleng + 1]); + char *local(value); + + for (yy_size_t i(0), e(yyleng); i != e; ++i) { + char next(yytext[i]); + if (next != '\\') + *local++ = next; + else + U(local, yytext, ++i); + } + + *local = '\0'; + I(identifier, Identifier(value), tk::Identifier_, hi::Identifier); +} + +({IdentifierStart}{IdentifierPart}*)?{IdentifierFail} L E("invalid identifier") 0[0-7]+ L C I(number, Number(strtoull(yytext + 1, NULL, 8)), tk::NumericLiteral, hi::Constant); 0[0-9]+ L C I(number, Number(strtoull(yytext + 1, NULL, 10)), tk::NumericLiteral, hi::Constant); @@ -411,10 +497,10 @@ XMLName {XMLNameStart}{XMLNamePart}* 0[bB][0-1]+ L C I(number, Number(strtoull(yytext + 2, NULL, 2)), tk::NumericLiteral, hi::Constant); (\.[0-9]+|(0|[1-9][0-9]*)(\.[0-9]*)?)([eE][+-]?[0-9]+)? 
L C I(number, Number(strtod(yytext, NULL)), tk::NumericLiteral, hi::Constant); -(\.[0-9]+|(0|[1-9][0-9]*)(\.[0-9]*)?)[eE][+-]?{IdentifierPart}* L E("invalid exponent") -(\.?[0-9]|(0|[1-9][0-9]*)\.){IdentifierPart}* L E("invalid number") +(\.[0-9]+|(0|[1-9][0-9]*)(\.[0-9]*)?)[eE][+-]?{IdentifierScrap} L E("invalid exponent") +(\.?[0-9]|(0|[1-9][0-9]*)\.){IdentifierScrap} L E("invalid number") -\"([^"\\\n]|{Escape})*\"|'([^'\\\n]|{Escape})*' L C { +'{SingleString}'|\"{DoubleString}\" L C { char *value(A char[yyleng]); char *local(value); @@ -422,8 +508,13 @@ XMLName {XMLNameStart}{XMLNamePart}* char next(yytext[i]); if (yytext[i] == '\\') - switch (next = yytext[++i]) { - case '\n': continue; + // XXX: support more line continuation characters + if (false) line: { + yylloc->end.lines(1); + yylloc->end.columns(yyleng - i); continue; /* a line continuation contributes no characters: skip the store of next below, as the replaced `case '\n': continue;` did */ + } else switch (next = yytext[++i]) { + case '\n': goto line; + case '\\': next = '\\'; break; case '\'': next = '\''; break; case '"': next = '"'; break; @@ -451,9 +542,13 @@ XMLName {XMLNameStart}{XMLNamePart}* break; case 'x': - next = H(yytext[i + 1]) << 4 | H(yytext[i + 2]); + U(local, H(yytext[i + 1]) << 4 | H(yytext[i + 2])); i += 2; - break; + continue; + + case 'u': + U(local, yytext, i); + continue; } *local++ = next; @@ -463,15 +558,18 @@ XMLName {XMLNameStart}{XMLNamePart}* I(string, String(value, local - value), tk::StringLiteral, hi::Constant); } -(\"([^"\\\n]|{Escape})*|'([^'\\\n]|{Escape})*)(\\(x.{0,2}|u.{0,4})?)? L E("invalid escape") +{StringPrefix}\\(x.{0,2}|u([^{].{0,3}|\{[^}]*)?|{UnicodeFail})? L E("invalid escape") +{StringPrefix} L E("invalid string") -\r?\n|\r|\xe2\x80[\xa8\xa9] yylloc->step(); yylloc->end.lines(); N +{LineTerminatorSequence} yylloc->step(); yylloc->end.lines(); N -[ \t] L +{WhiteSpace} L <<EOF>> if (yyextra->auto_) { yyextra->auto_ = false; F(tk::AutoComplete, hi::Nothing); } L yyterminate(); -@{IdentifierStart}{IdentifierPart}*|\xe2.|.
L E("unknown token") +@({UnicodeStart}{UnicodeScrap}|{UnicodeFail}) L E("invalid keyword") + +. L E("invalid character") %% diff --git a/Filter.sh b/Filter.sh index 6d8a4c5..35a1542 100755 --- a/Filter.sh +++ b/Filter.sh @@ -19,29 +19,49 @@ # along with this program. If not, see <http://www.gnu.org/licenses/>. # }}} +file=$1 +shift + filters=("$@") -while IFS= read -r line; do - if [[ ${line} = @if* ]]; then - line=${line#@if } - for name in "${filters[@]}"; do - if [[ ${line} = ${name}' '* ]]; then - echo "${line#${name} }" - fi - done - elif [[ ${line} = @begin* ]]; then - set ${line}; shift - filter= - for name in "${filters[@]}"; do - for side in "$@"; do - if [[ ${name} == ${side} ]]; then - unset filter +function include() { # print the filtered contents of file $1, expanding @include lines relative to that file's directory + local file="$1" dir line # per-call state: a nested @include must not overwrite the caller's dir + shift + + dir=/${file} + dir=${dir%/*} + dir=${dir:-/.} + dir=${dir#/} + dir=${dir}/ + + while IFS= read -r line; do + if false; then : + elif [[ ${line} = @if* ]]; then + line=${line#@if } + for name in "${filters[@]}"; do + if [[ ${line} = ${name}' '* ]]; then + echo "${line#${name} }" fi done - done - elif [[ ${line} = @end ]]; then - unset filter - elif [[ -z ${filter+@} ]]; then - echo "${line}" - fi -done + elif [[ ${line} = @begin* ]]; then + set ${line}; shift + filter= + for name in "${filters[@]}"; do + for side in "$@"; do + if [[ ${name} == ${side} ]]; then + unset filter + fi + done + done + elif [[ ${line} = @end ]]; then + unset filter + elif [[ ${line} = @include* ]]; then + line=${line#@include } + include "${dir}${line}" + elif [[ -z ${filter+@} ]]; then + echo "${line}" + fi + done <"${file}" +} + +include "${file}" diff --git a/Library.cpp b/Library.cpp index afa705c..5c5c97d 100644 --- a/Library.cpp +++ b/Library.cpp @@ -120,7 +120,7 @@ void CYStringify(std::ostringstream &str, const char *data, size_t size) { str << (single ?
'\'' : '"'); for (const char *value(data), *end(data + size); value != end; ++value) - switch (*value) { + switch (uint8_t next = *value) { case '\\': str << "\\\\"; break; case '\b': str << "\\b"; break; case '\f': str << "\\f"; break; @@ -141,12 +141,35 @@ void CYStringify(std::ostringstream &str, const char *data, size_t size) { else goto simple; break; + case '\0': + if (value + 1 != end && value[1] >= '0' && value[1] <= '9') /* only peek at the following byte when one exists; "\0" followed by a digit would read back as an octal escape */ + str << "\\x00"; + else + str << "\\0"; + break; + default: - // this test is designed to be "awesome", generating neither warnings nor incorrect results - if (*value < 0x20 || *value >= 0x7f) - str << "\\x" << std::setbase(16) << std::setw(2) << std::setfill('0') << unsigned(uint8_t(*value)); - else simple: + if (next >= 0x20 && next < 0x7f) simple: str << *value; + else { + unsigned levels(1); + if ((next & 0x80) != 0) + while ((next & 0x80 >> ++levels) != 0); + + unsigned point(next & 0xff >> levels); + while (--levels != 0 && value + 1 != end) /* stop at the end of the buffer so a truncated multi-byte sequence cannot walk value past end */ + point = point << 6 | uint8_t(*++value) & 0x3f; + + if (point < 0x100) + str << "\\x" << std::setbase(16) << std::setw(2) << std::setfill('0') << point; + else if (point < 0x10000) + str << "\\u" << std::setbase(16) << std::setw(4) << std::setfill('0') << point; + else { + point -= 0x10000; + str << "\\u" << std::setbase(16) << std::setw(4) << std::setfill('0') << (0xd800 | point >> 0x0a); + str << "\\u" << std::setbase(16) << std::setw(4) << std::setfill('0') << (0xdc00 | point & 0x3ff); + } + } } str << (single ?
'\'' : '"'); diff --git a/Makefile.am b/Makefile.am index 76f44b2..8be16df 100644 --- a/Makefile.am +++ b/Makefile.am @@ -91,11 +91,11 @@ endif CLEANFILES += Cycript.yy Cycript.yy: Cycript.yy.in - $(srcdir)/Filter.sh <$< >$@ $(filters) + $(srcdir)/Filter.sh $< >$@ $(filters) CLEANFILES += Cycript.l -Cycript.l: Cycript.l.in - $(srcdir)/Filter.sh <$< >$@ $(filters) +Cycript.l: Cycript.l.in UnicodeIDStart.l UnicodeIDContinue.l + $(srcdir)/Filter.sh $< >$@ $(filters) CLEANFILES += lex.cy.cpp lex.cy.cpp: Cycript.l diff --git a/Makefile.in b/Makefile.in index 1c86879..d380f6c 100644 --- a/Makefile.in +++ b/Makefile.in @@ -1318,9 +1318,9 @@ uninstall-am: uninstall-binPROGRAMS uninstall-libLTLIBRARIES @CY_EXECUTE_TRUE@Bridge.hpp: Bridge.gperf @CY_EXECUTE_TRUE@ $(GPERF) $< | $(SED) -e 's/defined __GNUC_STDC_INLINE__ || defined __GNUC_GNU_INLINE__/0/' >$@ Cycript.yy: Cycript.yy.in - $(srcdir)/Filter.sh <$< >$@ $(filters) -Cycript.l: Cycript.l.in - $(srcdir)/Filter.sh <$< >$@ $(filters) + $(srcdir)/Filter.sh $< >$@ $(filters) +Cycript.l: Cycript.l.in UnicodeIDStart.l UnicodeIDContinue.l + $(srcdir)/Filter.sh $< >$@ $(filters) lex.cy.cpp: Cycript.l $(FLEX) -b -t $< | $(SED) -e 's/int yyl;/yy_size_t yyl;/;s/int yyleng_r;/yy_size_t yyleng_r;/;s/yyg =/yyg __attribute__((__unused__)) =/' >$@ grep -F 'No backing up.' 
lex.backup >/dev/null diff --git a/UnicodeIDContinue.l b/UnicodeIDContinue.l new file mode 100644 index 0000000..e65b509 --- /dev/null +++ b/UnicodeIDContinue.l @@ -0,0 +1,10 @@ +UnicodeIDContinue_0 [\x30-\x39\x41-\x5a\x5f\x61-\x7a]|\xc2[\xaa\xba\xb5\xb7]|\xc3[\x80-\x96\x98-\xb6\xb8-\xbf]|\xcb[\x80\x81\xa2\xa3\xa1\x86-\x91\xac\xae\xa0\xa4]|\xcd[\x80-\xb4\xb6\xb7\xba-\xbd\xbf]|\xce[\x86-\x8a\x8c\x8e-\xa1\xa3-\xbf]|\xcf[\x80-\xb5\xb7-\xbf]|\xd2[\x80\x81\x83-\x87\x8a-\xbf]|\xd4[\x80-\xaf\xb1-\xbf]|\xd5[\x80-\x96\x99\xa1-\xbf]|\xd6[\x80-\x87\x91-\xbd\xbf]|\xd7[\x81\x82\x84\x85\x87\x90-\xaa\xb0-\xb2]|\xd8[\x90-\x9a\xa0-\xbf]|\xd9[\x80-\xa9\xae-\xbf]|\xdb[\x80-\x93\x95-\x9c\x9f-\xa8\xaa-\xbc\xbf]|\xdc[\x90-\xbf]|\xdd[\x80-\x8a\x8d-\xbf]|\xde[\x80-\xb1]|\xdf[\x80-\xb5\xba]|[\xc4-\xca\xcc\xd0\xd1\xd3\xda][\x80-\xbf]|\xe0\xa0[\x80-\xad]|\xe0\xa1[\x80-\x9b]|\xe0\xa2[\xa0-\xb4]|\xe0\xa3[\xa3-\xbf]|\xe0\xa5[\x80-\xa3\xa6-\xaf\xb1-\xbf]|\xe0\xa6[\x80-\x83\x85-\x8c\x8f\x90\x93-\xa8\xaa-\xb0\xb2\xb6-\xb9\xbc-\xbf]|\xe0\xa7[\x80-\x84\x87\x88\x8b-\x8e\x97\x9c\x9d\x9f-\xa3\xa6-\xb1]|\xe0\xa8[\x81-\x83\x85-\x8a\x8f\x90\x93-\xa8\xaa-\xb0\xb2\xb3\xb5\xb6\xb8\xb9\xbc\xbe\xbf]|\xe0\xa9[\x80-\x82\x87\x88\x8b-\x8d\x91\x99-\x9c\x9e\xa6-\xb5] +UnicodeIDContinue_1 
\xe0\xaa[\x81-\x83\x85-\x8d\x8f-\x91\x93-\xa8\xaa-\xb0\xb2\xb3\xb5-\xb9\xbc-\xbf]|\xe0\xab[\x80-\x85\x87-\x89\x8b-\x8d\x90\xa0-\xa3\xa6-\xaf\xb9]|\xe0\xac[\x81-\x83\x85-\x8c\x8f\x90\x93-\xa8\xaa-\xb0\xb2\xb3\xb5-\xb9\xbc-\xbf]|\xe0\xad[\x80-\x84\x87\x88\x8b-\x8d\x96\x97\x9c\x9d\x9f-\xa3\xa6-\xaf\xb1]|\xe0\xae[\x82\x83\x85-\x8a\x8e-\x90\x92-\x95\x99\x9a\x9c\x9e\x9f\xa3\xa4\xa8-\xaa\xae-\xb9\xbe\xbf]|\xe0\xaf[\x80-\x82\x86-\x88\x8a-\x8d\x90\x97\xa6-\xaf]|\xe0\xb0[\x80-\x83\x85-\x8c\x8e-\x90\x92-\xa8\xaa-\xb9\xbd-\xbf]|\xe0\xb1[\x80-\x84\x86-\x88\x8a-\x8d\x95\x96\x98-\x9a\xa0-\xa3\xa6-\xaf]|\xe0\xb2[\x81-\x83\x85-\x8c\x8e-\x90\x92-\xa8\xaa-\xb3\xb5-\xb9\xbc-\xbf]|\xe0\xb3[\x80-\x84\x86-\x88\x8a-\x8d\x95\x96\x9e\xa0-\xa3\xa6-\xaf\xb1\xb2]|\xe0\xb4[\x81-\x83\x85-\x8c\x8e-\x90\x92-\xba\xbd-\xbf]|\xe0\xb5[\x80-\x84\x86-\x88\x8a-\x8e\x97\x9f-\xa3\xa6-\xaf\xba-\xbf]|\xe0\xb6[\x82\x83\x85-\x96\x9a-\xb1\xb3-\xbb\xbd]|\xe0\xb7[\x80-\x86\x8a\x8f-\x94\x96\x98-\x9f\xa6-\xaf\xb2\xb3]|\xe0\xb8[\x81-\xba] +UnicodeIDContinue_2 
\xe0\xb9[\x80-\x8e\x90-\x99]|\xe0\xba[\x81\x82\x84\x87\x88\x8a\x8d\x94-\x97\x99-\x9f\xa1-\xa3\xa5\xa7\xaa\xab\xad-\xb9\xbb-\xbd]|\xe0\xbb[\x80-\x84\x86\x88-\x8d\x90-\x99\x9c-\x9f]|\xe0\xbc[\x80\xa0\xa2-\xa6\xa1\xa8\xa9\xa7\xbe\xb5\xb7\x98\x99\xb9\xbf]|\xe0\xbd[\x80-\x87\x89-\xac\xb1-\xbf]|\xe0\xbe[\x80-\x84\x86-\x97\x99-\xbc]|\xe0\xbf[\x86]|\xe0[\xa4][\x80-\xbf]|\xe1\x81[\x80-\x89\x90-\xbf]|\xe1\x82[\x80-\x9d\xa0-\xbf]|\xe1\x83[\x80-\x85\x87\x8d\x90-\xba\xbc-\xbf]|\xe1\x89[\x80-\x88\x8a-\x8d\x90-\x96\x98\x9a-\x9d\xa0-\xbf]|\xe1\x8a[\x80-\x88\x8a-\x8d\x90-\xb0\xb2-\xb5\xb8-\xbe]|\xe1\x8b[\x80\x82-\x85\x88-\x96\x98-\xbf]|\xe1\x8c[\x80-\x90\x92-\x95\x98-\xbf]|\xe1\x8d[\x80-\x9a\x9d-\x9f\xa9-\xb1]|\xe1\x8e[\x80-\x8f\xa0-\xbf]|\xe1\x8f[\x80-\xb5\xb8-\xbd]|\xe1\x90[\x81-\xbf]|\xe1\x99[\x80-\xac\xaf-\xbf]|\xe1\x9a[\x81-\x9a\xa0-\xbf]|\xe1\x9b[\x80-\xaa\xae-\xb8]|\xe1\x9c[\x80-\x8c\x8e-\x94\xa0-\xb4]|\xe1\x9d[\x80-\x93\xa0-\xac\xae-\xb0\xb2\xb3]|\xe1\x9f[\x80-\x93\x97\x9c\x9d\xa0-\xa9]|\xe1\xa0[\x8b-\x8d\x90-\x99\xa0-\xbf] +UnicodeIDContinue_3 
\xe1\xa1[\x80-\xb7]|\xe1\xa2[\x80-\xaa\xb0-\xbf]|\xe1\xa3[\x80-\xb5]|\xe1\xa4[\x80-\x9e\xa0-\xab\xb0-\xbb]|\xe1\xa5[\x86-\xad\xb0-\xb4]|\xe1\xa6[\x80-\xab\xb0-\xbf]|\xe1\xa7[\x80-\x89\x90-\x9a]|\xe1\xa8[\x80-\x9b\xa0-\xbf]|\xe1\xa9[\x80-\x9e\xa0-\xbc\xbf]|\xe1\xaa[\x80-\x89\x90-\x99\xa7\xb0-\xbd]|\xe1\xad[\x80-\x8b\x90-\x99\xab-\xb3]|\xe1\xaf[\x80-\xb3]|\xe1\xb0[\x80-\xb7]|\xe1\xb1[\x80-\x89\x8d-\xbd]|\xe1\xb3[\x90-\x92\x94-\xb6\xb8\xb9]|\xe1\xb7[\x80-\xb5\xbc-\xbf]|\xe1\xbc[\x80-\x95\x98-\x9d\xa0-\xbf]|\xe1\xbd[\x80-\x85\x88-\x8d\x90-\x97\x99\x9b\x9d\x9f-\xbd]|\xe1\xbe[\x80-\xb4\xb6-\xbc\xbe]|\xe1\xbf[\x82-\x84\x86-\x8c\x90-\x93\x96-\x9b\xa0-\xac\xb2-\xb4\xb6-\xbc]|\xe1[\x80\x84-\x88\x91-\x98\x9e\xac\xae\xb4-\xb6\xb8-\xbb][\x80-\xbf]|\xe2\x80[\xbf]|\xe2\x81[\x80\xb1\x94\xbf]|\xe2\x82[\x90-\x9c]|\xe2\x83[\x90-\x9c\xa1\xa5-\xb0]|\xe2\x84[\x82\x87\x8a-\x93\x95\x98-\x9d\xa4\xa6\xa8\xaa-\xb9\xbc-\xbf]|\xe2\x85[\x85-\x89\x8e\xa0-\xbf]|\xe2\x86[\x80-\x88]|\xe2\xb0[\x80-\xae\xb0-\xbf]|\xe2\xb1[\x80-\x9e\xa0-\xbf] +UnicodeIDContinue_4 
\xe2\xb3[\x80-\xa4\xab-\xb3]|\xe2\xb4[\x80-\xa5\xa7\xad\xb0-\xbf]|\xe2\xb5[\x80-\xa7\xaf\xbf]|\xe2\xb6[\x80-\x96\xa0-\xa6\xa8-\xae\xb0-\xb6\xb8-\xbe]|\xe2\xb7[\x80-\x86\x88-\x8e\x90-\x96\x98-\x9e\xa0-\xbf]|\xe2[\xb2][\x80-\xbf]|\xe3\x80[\x85-\x87\xa1-\xaf\xb1-\xb5\xb8-\xbc]|\xe3\x81[\x81-\xbf]|\xe3\x82[\x80-\x96\x99-\x9f\xa1-\xbf]|\xe3\x83[\x80-\xba\xbc-\xbf]|\xe3\x84[\x85-\xad\xb1-\xbf]|\xe3\x86[\x80-\x8e\xa0-\xba]|\xe3\x87[\xb0-\xbf]|\xe3[\x85\x90-\xbf][\x80-\xbf]|\xe4\xb6[\x80-\xb5]|\xe4[\x80-\xb5\xb8-\xbf][\x80-\xbf]|\xe9\xbf[\x80-\x95]|\xe9[\x80-\xbe][\x80-\xbf]|\xea\x92[\x80-\x8c]|\xea\x93[\x90-\xbd]|\xea\x98[\x80-\x8c\x90-\xab]|\xea\x99[\x80-\xaf\xb4-\xbd\xbf]|\xea\x9b[\x80-\xb1]|\xea\x9c[\x97-\x9f\xa2-\xbf]|\xea\x9e[\x80-\x88\x8b-\xad\xb0-\xb7]|\xea\x9f[\xb7-\xbf]|\xea\xa0[\x80-\xa7]|\xea\xa1[\x80-\xb3]|\xea\xa3[\x80-\x84\x90-\x99\xa0-\xb7\xbb\xbd]|\xea\xa4[\x80-\xad\xb0-\xbf]|\xea\xa5[\x80-\x93\xa0-\xbc]|\xea\xa7[\x80\x8f-\x99\xa0-\xbe]|\xea\xa8[\x80-\xb6]|\xea\xa9[\x80-\x8d\x90-\x99\xa0-\xb6\xba-\xbf] +UnicodeIDContinue_5 
\xea\xab[\x80-\x82\x9b-\x9d\xa0-\xaf\xb2-\xb6]|\xea\xac[\x81-\x86\x89-\x8e\x91-\x96\xa0-\xa6\xa8-\xae\xb0-\xbf]|\xea\xad[\x80-\x9a\x9c-\xa5\xb0-\xbf]|\xea\xaf[\x80-\xaa\xac\xad\xb0-\xb9]|\xea[\x80-\x91\x94-\x97\x9a\x9d\xa2\xa6\xaa\xae\xb0-\xbf][\x80-\xbf]|\xed\x9e[\x80-\xa3\xb0-\xbf]|\xed\x9f[\x80-\x86\x8b-\xbb]|\xed[\x80-\x9d][\x80-\xbf]|\xef\xa9[\x80-\xad\xb0-\xbf]|\xef\xab[\x80-\x99]|\xef\xac[\x80-\x86\x93-\x97\x9d-\xa8\xaa-\xb6\xb8-\xbc\xbe]|\xef\xad[\x80\x81\x83\x84\x86-\xbf]|\xef\xae[\x80-\xb1]|\xef\xaf[\x93-\xbf]|\xef\xb4[\x80-\xbd]|\xef\xb5[\x90-\xbf]|\xef\xb6[\x80-\x8f\x92-\xbf]|\xef\xb7[\x80-\x87\xb0-\xbb]|\xef\xb8[\x80-\x8f\xa0-\xaf\xb3\xb4]|\xef\xb9[\x8d-\x8f\xb0-\xb4\xb6-\xbf]|\xef\xbb[\x80-\xbc]|\xef\xbc[\x90-\x99\xa1-\xba\xbf]|\xef\xbd[\x81-\x9a\xa6-\xbf]|\xef\xbe[\x80-\xbe]|\xef\xbf[\x82-\x87\x8a-\x8f\x92-\x97\x9a-\x9c]|\xef[\xa4-\xa8\xaa\xb0-\xb3\xba][\x80-\xbf]|[\xe5-\xe8\xeb\xec][\x80-\xbf][\x80-\xbf]|\xf0\x90\x80[\x80-\x8b\x8d-\xa6\xa8-\xba\xbc\xbd\xbf]|\xf0\x90\x81[\x80-\x8d\x90-\x9d] +UnicodeIDContinue_6 
\xf0\x90\x83[\x80-\xba]|\xf0\x90\x85[\x80-\xb4]|\xf0\x90\x87[\xbd]|\xf0\x90\x8a[\x80-\x9c\xa0-\xbf]|\xf0\x90\x8b[\x80-\x90\xa0]|\xf0\x90\x8c[\x80-\x9f\xb0-\xbf]|\xf0\x90\x8d[\x80-\x8a\x90-\xba]|\xf0\x90\x8e[\x80-\x9d\xa0-\xbf]|\xf0\x90\x8f[\x80-\x83\x88-\x8f\x91-\x95]|\xf0\x90\x92[\x80-\x9d\xa0-\xa9]|\xf0\x90\x94[\x80-\xa7\xb0-\xbf]|\xf0\x90\x95[\x80-\xa3]|\xf0\x90\x9c[\x80-\xb6]|\xf0\x90\x9d[\x80-\x95\xa0-\xa7]|\xf0\x90\xa0[\x80-\x85\x88\x8a-\xb5\xb7\xb8\xbc\xbf]|\xf0\x90\xa1[\x80-\x95\xa0-\xb6]|\xf0\x90\xa2[\x80-\x9e]|\xf0\x90\xa3[\xa0-\xb2\xb4\xb5]|\xf0\x90\xa4[\x80-\x95\xa0-\xb9]|\xf0\x90\xa6[\x80-\xb7\xbe\xbf]|\xf0\x90\xa8[\x80-\x83\x85\x86\x8c-\x93\x95-\x97\x99-\xb3\xb8-\xba\xbf]|\xf0\x90\xa9[\xa0-\xbc]|\xf0\x90\xaa[\x80-\x9c]|\xf0\x90\xab[\x80-\x87\x89-\xa6]|\xf0\x90\xac[\x80-\xb5]|\xf0\x90\xad[\x80-\x95\xa0-\xb2]|\xf0\x90\xae[\x80-\x91]|\xf0\x90\xb1[\x80-\x88]|\xf0\x90\xb2[\x80-\xb2]|\xf0\x90\xb3[\x80-\xb2]|\xf0\x90[\x82\x90\x91\x98-\x9b\xb0][\x80-\xbf]|\xf0\x91\x81[\x80-\x86\xa7-\xaf\xbf\xa6] +UnicodeIDContinue_7 
\xf0\x91\x82[\x80-\xba]|\xf0\x91\x83[\x90-\xa8\xb0-\xb9]|\xf0\x91\x84[\x80-\xb4\xb6-\xbf]|\xf0\x91\x85[\x90-\xb3\xb6]|\xf0\x91\x87[\x80-\x84\x8a-\x8c\x90-\x9a\x9c]|\xf0\x91\x88[\x80-\x91\x93-\xb7]|\xf0\x91\x8a[\x80-\x86\x88\x8a-\x8d\x8f-\x9d\x9f-\xa8\xb0-\xbf]|\xf0\x91\x8b[\x80-\xaa\xb0-\xb9]|\xf0\x91\x8c[\x80-\x83\x85-\x8c\x8f\x90\x93-\xa8\xaa-\xb0\xb2\xb3\xb5-\xb9\xbc-\xbf]|\xf0\x91\x8d[\x80-\x84\x87\x88\x8b-\x8d\x90\x97\x9d-\xa3\xa6-\xac\xb0-\xb4]|\xf0\x91\x93[\x80-\x85\x87\x90-\x99]|\xf0\x91\x96[\x80-\xb5\xb8-\xbf]|\xf0\x91\x97[\x80\x98-\x9d]|\xf0\x91\x99[\x80\x84\x90-\x99]|\xf0\x91\x9a[\x80-\xb7]|\xf0\x91\x9b[\x80-\x89]|\xf0\x91\x9c[\x80-\x99\x9d-\xab\xb0-\xb9]|\xf0\x91\xa2[\xa0-\xbf]|\xf0\x91\xa3[\x80-\xa9\xbf]|\xf0\x91\xab[\x80-\xb8]|\xf0\x91[\x80\x86\x92\x98][\x80-\xbf]|\xf0\x92\x8e[\x80-\x99]|\xf0\x92\x91[\x80-\xae]|\xf0\x92\x95[\x80-\x83]|\xf0\x92[\x80-\x8d\x90\x92-\x94][\x80-\xbf]|\xf0\x93\x90[\x80-\xae]|\xf0\x93[\x80-\x8f][\x80-\xbf]|\xf0\x94\x99[\x80-\x86]|\xf0\x94[\x90-\x98][\x80-\xbf] +UnicodeIDContinue_8 
\xf0\x96\xa8[\x80-\xb8]|\xf0\x96\xa9[\x80-\x9e\xa0-\xa9]|\xf0\x96\xab[\x90-\xad\xb0-\xb4]|\xf0\x96\xac[\x80-\xb6]|\xf0\x96\xad[\x80-\x83\x90-\x99\xa3-\xb7\xbd-\xbf]|\xf0\x96\xae[\x80-\x8f]|\xf0\x96\xbd[\x80-\x84\x90-\xbe]|\xf0\x96\xbe[\x8f-\x9f]|\xf0\x96[\xa0-\xa7\xbc][\x80-\xbf]|\xf0\x9b\x80[\x80\x81]|\xf0\x9b\xb2[\x80-\x88\x90-\x99\x9d\x9e]|\xf0\x9b\xb1[\x80-\xaa\xb0-\xbc]|\xf0\x9b[\xb0][\x80-\xbf]|\xf0\x9d\x85[\xa5-\xa9\xad-\xb2\xbb-\xbf]|\xf0\x9d\x86[\x80-\x82\xab\x85-\x8b\xac\xad\xaa]|\xf0\x9d\x89[\x82-\x84]|\xf0\x9d\x91[\x80-\x94\x96-\xbf]|\xf0\x9d\x92[\x80-\x9c\x9e\x9f\xa2\xa5\xa6\xa9-\xac\xae-\xb9\xbb\xbd-\xbf]|\xf0\x9d\x93[\x80-\x83\x85-\xbf]|\xf0\x9d\x94[\x80-\x85\x87-\x8a\x8d-\x94\x96-\x9c\x9e-\xb9\xbb-\xbe]|\xf0\x9d\x95[\x80-\x84\x86\x8a-\x90\x92-\xbf]|\xf0\x9d\x9a[\x80-\xa5\xa8-\xbf]|\xf0\x9d\x9b[\x80\x82-\x9a\x9c-\xba\xbc-\xbf]|\xf0\x9d\x9c[\x80-\x94\x96-\xb4\xb6-\xbf]|\xf0\x9d\x9d[\x80-\x8e\x90-\xae\xb0-\xbf]|\xf0\x9d\x9e[\x80-\x88\x8a-\xa8\xaa-\xbf]|\xf0\x9d\x9f[\x80-\x82\x84-\x8b\x8e-\xbf] +UnicodeIDContinue 
{UnicodeIDContinue_0}|{UnicodeIDContinue_1}|{UnicodeIDContinue_2}|{UnicodeIDContinue_3}|{UnicodeIDContinue_4}|{UnicodeIDContinue_5}|{UnicodeIDContinue_6}|{UnicodeIDContinue_7}|{UnicodeIDContinue_8}|\xf0\x9d\xa8[\x80-\xb6\xbb-\xbf]|\xf0\x9d\xa9[\x80-\xac\xb5]|\xf0\x9d\xaa[\xa1-\xa3\x84\xa5-\xaf\xa4\x9b-\x9f]|\xf0\x9d[\x90\x96-\x99][\x80-\xbf]|\xf0\x9e\xa3[\x80-\x84\x90-\x96]|\xf0\x9e\xb8[\x80-\x83\x85-\x9f\xa1\xa2\xa4\xa7\xa9-\xb2\xb4-\xb7\xb9\xbb]|\xf0\x9e\xb9[\x82\x87\x89\x8b\x8d-\x8f\x91\x92\x94\x97\x99\x9b\x9d\x9f\xa1\xa2\xa4\xa7-\xaa\xac-\xb2\xb4-\xb7\xb9-\xbc\xbe]|\xf0\x9e\xba[\x80-\x89\x8b-\x9b\xa1-\xa3\xa5-\xa9\xab-\xbb]|\xf0\x9e[\xa0-\xa2][\x80-\xbf]|\xf0\xaa\x9b[\x80-\x96]|\xf0\xaa[\x80-\x9a\x9c-\xbf][\x80-\xbf]|\xf0\xab\x9c[\x80-\xb4]|\xf0\xab\xa0[\x80-\x9d\xa0-\xbf]|\xf0\xab[\x80-\x9b\x9d-\x9f\xa1-\xbf][\x80-\xbf]|\xf0\xac\xba[\x80-\xa1]|\xf0\xac[\x80-\xb9][\x80-\xbf]|\xf0\xaf\xa8[\x80-\x9d]|\xf0\xaf[\xa0-\xa7][\x80-\xbf]|\xf0[\xa0-\xa9][\x80-\xbf][\x80-\xbf]|\xf3\xa0\x87[\x80-\xaf]|\xf3\xa0[\x84-\x86][\x80-\xbf] diff --git a/UnicodeIDStart.l b/UnicodeIDStart.l new file mode 100644 index 0000000..3b0882a --- /dev/null +++ b/UnicodeIDStart.l @@ -0,0 +1,9 @@ +UnicodeIDStart_0 
[\x41-\x5a\x61-\x7a]|\xc2[\xaa\xb5\xba]|\xc3[\x80-\x96\x98-\xb6\xb8-\xbf]|\xcb[\x80\x81\xa2\xa3\xa1\x86-\x91\xac\xae\xa0\xa4]|\xcd[\xb0-\xb4\xb6\xb7\xba-\xbd\xbf]|\xce[\x86\x88-\x8a\x8c\x8e-\xa1\xa3-\xbf]|\xcf[\x80-\xb5\xb7-\xbf]|\xd2[\x80\x81\x8a-\xbf]|\xd4[\x80-\xaf\xb1-\xbf]|\xd5[\x80-\x96\x99\xa1-\xbf]|\xd6[\x80-\x87]|\xd7[\x90-\xaa\xb0-\xb2]|\xd8[\xa0-\xbf]|\xd9[\x80-\x8a\xae\xaf\xb1-\xbf]|\xdb[\x80-\x93\x95\xa5\xa6\xae\xaf\xba-\xbc\xbf]|\xdc[\x90\x92-\xaf]|\xdd[\x8d-\xbf]|\xde[\x80-\xa5\xb1]|\xdf[\x8a-\xaa\xb4\xb5\xba]|[\xc4-\xca\xd0\xd1\xd3\xda][\x80-\xbf]|\xe0\xa0[\x80-\x95\x9a\xa4\xa8]|\xe0\xa1[\x80-\x98]|\xe0\xa2[\xa0-\xb4]|\xe0\xa4[\x84-\xb9\xbd]|\xe0\xa5[\x90\x98-\xa1\xb1-\xbf]|\xe0\xa6[\x80\x85-\x8c\x8f\x90\x93-\xa8\xaa-\xb0\xb2\xb6-\xb9\xbd]|\xe0\xa7[\xa0\xa1\x8e\xb0\xb1\x9c\x9d\x9f]|\xe0\xa8[\x85-\x8a\x8f\x90\x93-\xa8\xaa-\xb0\xb2\xb3\xb5\xb6\xb8\xb9]|\xe0\xa9[\xb2-\xb4\x99-\x9c\x9e]|\xe0\xaa[\x85-\x8d\x8f-\x91\x93-\xa8\xaa-\xb0\xb2\xb3\xb5-\xb9\xbd]|\xe0\xab[\x90\xa0\xb9\xa1] +UnicodeIDStart_1 
\xe0\xac[\x85-\x8c\x8f\x90\x93-\xa8\xaa-\xb0\xb2\xb3\xb5-\xb9\xbd]|\xe0\xad[\xa0\xa1\xb1\x9c\x9d\x9f]|\xe0\xae[\x83\x85-\x8a\x8e-\x90\x92-\x95\x99\x9a\x9c\x9e\x9f\xa3\xa4\xa8-\xaa\xae-\xb9]|\xe0\xaf[\x90]|\xe0\xb0[\x85-\x8c\x8e-\x90\x92-\xa8\xaa-\xb9\xbd]|\xe0\xb1[\x98-\x9a\xa0\xa1]|\xe0\xb2[\x85-\x8c\x8e-\x90\x92-\xa8\xaa-\xb3\xb5-\xb9\xbd]|\xe0\xb3[\xa0\xa1\xb2\x9e\xb1]|\xe0\xb4[\x85-\x8c\x8e-\x90\x92-\xba\xbd]|\xe0\xb5[\xa0\xa1\xbf\x8e\xba-\xbe\x9f]|\xe0\xb6[\x85-\x96\x9a-\xb1\xb3-\xbb\xbd]|\xe0\xb7[\x80-\x86]|\xe0\xb8[\x81-\xb0\xb2\xb3]|\xe0\xb9[\x80-\x86]|\xe0\xba[\x81\x82\x84\x87\x88\x8a\x8d\x94-\x97\x99-\x9f\xa1-\xa3\xa5\xa7\xaa\xab\xad-\xb0\xb2\xb3\xbd]|\xe0\xbb[\x80-\x84\x86\x9c-\x9f]|\xe0\xbc[\x80]|\xe0\xbd[\x80-\x87\x89-\xac]|\xe0\xbe[\x88-\x8c]|\xe1\x80[\x80-\xaa\xbf]|\xe1\x81[\x90-\x95\x9a-\x9d\xa1\xa5\xa6\xae-\xb0\xb5-\xbf]|\xe1\x82[\x80\x81\x8e\xa0-\xbf]|\xe1\x83[\x80-\x85\x87\x8d\x90-\xba\xbc-\xbf]|\xe1\x89[\x80-\x88\x8a-\x8d\x90-\x96\x98\x9a-\x9d\xa0-\xbf]|\xe1\x8a[\x80-\x88\x8a-\x8d\x90-\xb0\xb2-\xb5\xb8-\xbe] +UnicodeIDStart_2 
\xe1\x8b[\x80\x82-\x85\x88-\x96\x98-\xbf]|\xe1\x8c[\x80-\x90\x92-\x95\x98-\xbf]|\xe1\x8d[\x80-\x9a]|\xe1\x8e[\x80-\x8f\xa0-\xbf]|\xe1\x8f[\x80-\xb5\xb8-\xbd]|\xe1\x90[\x81-\xbf]|\xe1\x99[\x80-\xac\xaf-\xbf]|\xe1\x9a[\x81-\x9a\xa0-\xbf]|\xe1\x9b[\x80-\xaa\xae-\xb8]|\xe1\x9c[\x80-\x8c\x8e-\x91\xa0-\xb1]|\xe1\x9d[\x80-\x91\xa0-\xac\xae-\xb0]|\xe1\x9e[\x80-\xb3]|\xe1\x9f[\x9c\x97]|\xe1\xa0[\xa0-\xbf]|\xe1\xa1[\x80-\xb7]|\xe1\xa2[\x80-\xa8\xaa\xb0-\xbf]|\xe1\xa3[\x80-\xb5]|\xe1\xa4[\x80-\x9e]|\xe1\xa5[\x90-\xad\xb0-\xb4]|\xe1\xa6[\x80-\xab\xb0-\xbf]|\xe1\xa7[\x80-\x89]|\xe1\xa8[\x80-\x96\xa0-\xbf]|\xe1\xa9[\x80-\x94]|\xe1\xaa[\xa7]|\xe1\xac[\x85-\xb3]|\xe1\xad[\x85-\x8b]|\xe1\xae[\x83-\xa0\xae\xaf\xba-\xbf]|\xe1\xaf[\x80-\xa5]|\xe1\xb0[\x80-\xa3]|\xe1\xb1[\x8d-\x8f\x9a-\xbd]|\xe1\xb3[\xa9-\xac\xae-\xb1\xb5\xb6]|\xe1\xbc[\x80-\x95\x98-\x9d\xa0-\xbf]|\xe1\xbd[\x80-\x85\x88-\x8d\x90-\x97\x99\x9b\x9d\x9f-\xbd]|\xe1\xbe[\x80-\xb4\xb6-\xbc\xbe]|\xe1\xbf[\x82-\x84\x86-\x8c\x90-\x93\x96-\x9b\xa0-\xac\xb2-\xb4\xb6-\xbc] +UnicodeIDStart_3 
\xe1[\x84-\x88\x91-\x98\xb4-\xb6\xb8-\xbb][\x80-\xbf]|\xe2\x81[\xb1\xbf]|\xe2\x82[\x90-\x9c]|\xe2\x84[\x82\x87\x8a-\x93\x95\x98-\x9d\xa4\xa6\xa8\xaa-\xb9\xbc-\xbf]|\xe2\x85[\x85-\x89\x8e\xa0-\xbf]|\xe2\x86[\x80-\x88]|\xe2\xb0[\x80-\xae\xb0-\xbf]|\xe2\xb1[\x80-\x9e\xa0-\xbf]|\xe2\xb3[\x80-\xa4\xab-\xae\xb2\xb3]|\xe2\xb4[\x80-\xa5\xa7\xad\xb0-\xbf]|\xe2\xb5[\x80-\xa7\xaf]|\xe2\xb6[\x80-\x96\xa0-\xa6\xa8-\xae\xb0-\xb6\xb8-\xbe]|\xe2\xb7[\x80-\x86\x88-\x8e\x90-\x96\x98-\x9e]|\xe2[\xb2][\x80-\xbf]|\xe3\x80[\x85-\x87\xa1-\xa9\xb1-\xb5\xb8-\xbc]|\xe3\x81[\x81-\xbf]|\xe3\x82[\x80-\x96\x9b-\x9f\xa1-\xbf]|\xe3\x83[\x80-\xba\xbc-\xbf]|\xe3\x84[\x85-\xad\xb1-\xbf]|\xe3\x86[\x80-\x8e\xa0-\xba]|\xe3\x87[\xb0-\xbf]|\xe3[\x85\x90-\xbf][\x80-\xbf]|\xe4\xb6[\x80-\xb5]|\xe4[\x80-\xb5\xb8-\xbf][\x80-\xbf]|\xe9\xbf[\x80-\x95]|\xe9[\x80-\xbe][\x80-\xbf]|\xea\x92[\x80-\x8c]|\xea\x93[\x90-\xbd]|\xea\x98[\x80-\x8c\x90-\x9f\xaa\xab]|\xea\x99[\x80-\xae\xbf]|\xea\x9a[\x80-\x9d\xa0-\xbf]|\xea\x9b[\x80-\xaf]|\xea\x9c[\x97-\x9f\xa2-\xbf] +UnicodeIDStart_4 
\xea\x9e[\x80-\x88\x8b-\xad\xb0-\xb7]|\xea\x9f[\xb7-\xbf]|\xea\xa0[\x80\x81\x83-\x85\x87-\x8a\x8c-\xa2]|\xea\xa1[\x80-\xb3]|\xea\xa2[\x82-\xb3]|\xea\xa3[\xb2-\xb7\xbb\xbd]|\xea\xa4[\x8a-\xa5\xb0-\xbf]|\xea\xa5[\x80-\x86\xa0-\xbc]|\xea\xa6[\x84-\xb2]|\xea\xa7[\xa0-\xa4\xa6-\xae\x8f\xbb\xba\xaf\xbc-\xbe]|\xea\xa8[\x80-\xa8]|\xea\xa9[\x80-\x82\x84-\x8b\xa0-\xb6\xba\xbe\xbf]|\xea\xaa[\x80-\xaf\xb1\xb5\xb6\xb9-\xbd]|\xea\xab[\x80\xa0\x82\xa3-\xa6\xa1\xa8-\xaa\xa7\xa2\xb2-\xb4\x9b-\x9d]|\xea\xac[\x81-\x86\x89-\x8e\x91-\x96\xa0-\xa6\xa8-\xae\xb0-\xbf]|\xea\xad[\x80-\x9a\x9c-\xa5\xb0-\xbf]|\xea\xaf[\x80-\xa2]|\xea[\x80-\x91\x94-\x97\x9d\xae\xb0-\xbf][\x80-\xbf]|\xed\x9e[\x80-\xa3\xb0-\xbf]|\xed\x9f[\x80-\x86\x8b-\xbb]|\xed[\x80-\x9d][\x80-\xbf]|\xef\xa9[\x80-\xad\xb0-\xbf]|\xef\xab[\x80-\x99]|\xef\xac[\x80-\x86\x93-\x97\x9d\x9f-\xa8\xaa-\xb6\xb8-\xbc\xbe]|\xef\xad[\x80\x81\x83\x84\x86-\xbf]|\xef\xae[\x80-\xb1]|\xef\xaf[\x93-\xbf]|\xef\xb4[\x80-\xbd]|\xef\xb5[\x90-\xbf]|\xef\xb6[\x80-\x8f\x92-\xbf] +UnicodeIDStart_5 
\xef\xb7[\x80-\x87\xb0-\xbb]|\xef\xb9[\xb0-\xb4\xb6-\xbf]|\xef\xbb[\x80-\xbc]|\xef\xbc[\xa1-\xba]|\xef\xbd[\x81-\x9a\xa6-\xbf]|\xef\xbe[\x80-\xbe]|\xef\xbf[\x82-\x87\x8a-\x8f\x92-\x97\x9a-\x9c]|\xef[\xa4-\xa8\xaa\xb0-\xb3\xba][\x80-\xbf]|[\xe5-\xe8\xeb\xec][\x80-\xbf][\x80-\xbf]|\xf0\x90\x80[\x80-\x8b\x8d-\xa6\xa8-\xba\xbc\xbd\xbf]|\xf0\x90\x81[\x80-\x8d\x90-\x9d]|\xf0\x90\x83[\x80-\xba]|\xf0\x90\x85[\x80-\xb4]|\xf0\x90\x8a[\x80-\x9c\xa0-\xbf]|\xf0\x90\x8b[\x80-\x90]|\xf0\x90\x8c[\x80-\x9f\xb0-\xbf]|\xf0\x90\x8d[\x80-\x8a\x90-\xb5]|\xf0\x90\x8e[\x80-\x9d\xa0-\xbf]|\xf0\x90\x8f[\x80-\x83\x88-\x8f\x91-\x95]|\xf0\x90\x92[\x80-\x9d]|\xf0\x90\x94[\x80-\xa7\xb0-\xbf]|\xf0\x90\x95[\x80-\xa3]|\xf0\x90\x9c[\x80-\xb6]|\xf0\x90\x9d[\x80-\x95\xa0-\xa7]|\xf0\x90\xa0[\x80-\x85\x88\x8a-\xb5\xb7\xb8\xbc\xbf]|\xf0\x90\xa1[\x80-\x95\xa0-\xb6]|\xf0\x90\xa2[\x80-\x9e]|\xf0\x90\xa3[\xa0-\xb2\xb4\xb5]|\xf0\x90\xa4[\x80-\x95\xa0-\xb9]|\xf0\x90\xa6[\x80-\xb7\xbe\xbf]|\xf0\x90\xa8[\x80\x90-\x93\x95-\x97\x99-\xb3] +UnicodeIDStart_6 
\xf0\x90\xa9[\xa0-\xbc]|\xf0\x90\xaa[\x80-\x9c]|\xf0\x90\xab[\x80-\x87\x89-\xa4]|\xf0\x90\xac[\x80-\xb5]|\xf0\x90\xad[\x80-\x95\xa0-\xb2]|\xf0\x90\xae[\x80-\x91]|\xf0\x90\xb1[\x80-\x88]|\xf0\x90\xb2[\x80-\xb2]|\xf0\x90\xb3[\x80-\xb2]|\xf0\x90[\x82\x90\x91\x98-\x9b\xb0][\x80-\xbf]|\xf0\x91\x80[\x83-\xb7]|\xf0\x91\x82[\x83-\xaf]|\xf0\x91\x83[\x90-\xa8]|\xf0\x91\x84[\x83-\xa6]|\xf0\x91\x85[\x90-\xb2\xb6]|\xf0\x91\x86[\x83-\xb2]|\xf0\x91\x87[\x81-\x84\x9a\x9c]|\xf0\x91\x88[\x80-\x91\x93-\xab]|\xf0\x91\x8a[\x80-\x86\x88\x8a-\x8d\x8f-\x9d\x9f-\xa8\xb0-\xbf]|\xf0\x91\x8b[\x80-\x9e]|\xf0\x91\x8c[\x85-\x8c\x8f\x90\x93-\xa8\xaa-\xb0\xb2\xb3\xb5-\xb9\xbd]|\xf0\x91\x8d[\xa0\xa1\x90\x9d-\x9f]|\xf0\x91\x92[\x80-\xaf]|\xf0\x91\x93[\x84\x85\x87]|\xf0\x91\x96[\x80-\xae]|\xf0\x91\x97[\x98-\x9b]|\xf0\x91\x98[\x80-\xaf]|\xf0\x91\x99[\x84]|\xf0\x91\x9a[\x80-\xaa]|\xf0\x91\x9c[\x80-\x99]|\xf0\x91\xa2[\xa0-\xbf]|\xf0\x91\xa3[\x80-\x9f\xbf]|\xf0\x91\xab[\x80-\xb8]|\xf0\x92\x8e[\x80-\x99]|\xf0\x92\x91[\x80-\xae] +UnicodeIDStart_7 
\xf0\x92\x95[\x80-\x83]|\xf0\x92[\x80-\x8d\x90\x92-\x94][\x80-\xbf]|\xf0\x93\x90[\x80-\xae]|\xf0\x93[\x80-\x8f][\x80-\xbf]|\xf0\x94\x99[\x80-\x86]|\xf0\x94[\x90-\x98][\x80-\xbf]|\xf0\x96\xa8[\x80-\xb8]|\xf0\x96\xa9[\x80-\x9e]|\xf0\x96\xab[\x90-\xad]|\xf0\x96\xac[\x80-\xaf]|\xf0\x96\xad[\x80-\x83\xa3-\xb7\xbd-\xbf]|\xf0\x96\xae[\x80-\x8f]|\xf0\x96\xbd[\x80-\x84\x90]|\xf0\x96\xbe[\x93-\x9f]|\xf0\x96[\xa0-\xa7\xbc][\x80-\xbf]|\xf0\x9b\x80[\x80\x81]|\xf0\x9b\xb2[\x80-\x88\x90-\x99]|\xf0\x9b\xb1[\x80-\xaa\xb0-\xbc]|\xf0\x9b[\xb0][\x80-\xbf]|\xf0\x9d\x91[\x80-\x94\x96-\xbf]|\xf0\x9d\x92[\x80-\x9c\x9e\x9f\xa2\xa5\xa6\xa9-\xac\xae-\xb9\xbb\xbd-\xbf]|\xf0\x9d\x93[\x80-\x83\x85-\xbf]|\xf0\x9d\x94[\x80-\x85\x87-\x8a\x8d-\x94\x96-\x9c\x9e-\xb9\xbb-\xbe]|\xf0\x9d\x95[\x80-\x84\x86\x8a-\x90\x92-\xbf]|\xf0\x9d\x9a[\x80-\xa5\xa8-\xbf]|\xf0\x9d\x9b[\x80\x82-\x9a\x9c-\xba\xbc-\xbf]|\xf0\x9d\x9c[\x80-\x94\x96-\xb4\xb6-\xbf]|\xf0\x9d\x9d[\x80-\x8e\x90-\xae\xb0-\xbf]|\xf0\x9d\x9e[\x80-\x88\x8a-\xa8\xaa-\xbf] +UnicodeIDStart {UnicodeIDStart_0}|{UnicodeIDStart_1}|{UnicodeIDStart_2}|{UnicodeIDStart_3}|{UnicodeIDStart_4}|{UnicodeIDStart_5}|{UnicodeIDStart_6}|{UnicodeIDStart_7}|\xf0\x9d\x9f[\x80-\x82\x84-\x8b]|\xf0\x9d[\x90\x96-\x99][\x80-\xbf]|\xf0\x9e\xa3[\x80-\x84]|\xf0\x9e\xb8[\x80-\x83\x85-\x9f\xa1\xa2\xa4\xa7\xa9-\xb2\xb4-\xb7\xb9\xbb]|\xf0\x9e\xb9[\x82\x87\x89\x8b\x8d-\x8f\x91\x92\x94\x97\x99\x9b\x9d\x9f\xa1\xa2\xa4\xa7-\xaa\xac-\xb2\xb4-\xb7\xb9-\xbc\xbe]|\xf0\x9e\xba[\x80-\x89\x8b-\x9b\xa1-\xa3\xa5-\xa9\xab-\xbb]|\xf0\x9e[\xa0-\xa2][\x80-\xbf]|\xf0\xaa\x9b[\x80-\x96]|\xf0\xaa[\x80-\x9a\x9c-\xbf][\x80-\xbf]|\xf0\xab\x9c[\x80-\xb4]|\xf0\xab\xa0[\x80-\x9d\xa0-\xbf]|\xf0\xab[\x80-\x9b\x9d-\x9f\xa1-\xbf][\x80-\xbf]|\xf0\xac\xba[\x80-\xa1]|\xf0\xac[\x80-\xb9][\x80-\xbf]|\xf0\xaf\xa8[\x80-\x9d]|\xf0\xaf[\xa0-\xa7][\x80-\xbf]|\xf0[\xa0-\xa9][\x80-\xbf][\x80-\xbf] diff --git a/backtrack.sh b/backtrack.sh new file mode 100755 index 0000000..a1eb8b4 --- /dev/null +++ b/backtrack.sh @@ -0,0 
+1,3 @@ +#!/bin/bash +./apple-make.sh +grep '^State' build.osx-i386/lex.backup | wc -l diff --git a/unicode.mk b/unicode.mk new file mode 100644 index 0000000..dc88024 --- /dev/null +++ b/unicode.mk @@ -0,0 +1,38 @@ +# Cycript - Optimizing JavaScript Compiler/Runtime +# Copyright (C) 2009-2015 Jay Freeman (saurik) + +# GNU Affero General Public License, Version 3 {{{ +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
+# }}}
+
+.DELETE_ON_ERROR:
+.PHONY: all
+
+# Inputs of the generated scanner fragments: the generator scripts plus the UCD data files.
+unicode := unicode.sh unicode.py
+unicode += DerivedCoreProperties.txt
+unicode += PropList.txt
+
+all: UnicodeIDStart.l UnicodeIDContinue.l
+
+# Download (or resume, -c) a UCD data file; HTTPS because its text is turned into scanner rules.
+%.txt:
+	wget -qc https://www.unicode.org/Public/UCD/latest/ucd/$@
+
+UnicodeIDStart.l: $(unicode)
+	./unicode.sh UnicodeIDStart ID_Start DerivedCoreProperties.txt Other_ID_Start PropList.txt >$@
+
+UnicodeIDContinue.l: $(unicode)
+	./unicode.sh UnicodeIDContinue ID_Continue DerivedCoreProperties.txt Other_ID_Continue PropList.txt >$@
diff --git a/unicode.py b/unicode.py
new file mode 100755
index 0000000..eebc83f
--- /dev/null
+++ b/unicode.py
@@ -0,0 +1,115 @@
+#!/usr/bin/python
+
+# Cycript - Optimizing JavaScript Compiler/Runtime
+# Copyright (C) 2009-2015 Jay Freeman (saurik)
+
+# GNU Affero General Public License, Version 3 {{{
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+# }}}
+
+# Reads lines of the form "XXXX[..YYYY] ; Property # ..." on stdin and prints
+# flex name definitions (called argv[1]) matching those code points as UTF-8.
+
+import sys
+
+# trees[n - 1] is a trie, keyed by byte value, of every n-byte UTF-8 sequence.
+trees = [dict(), dict(), dict(), dict()]
+
+for line in sys.stdin:
+    # Only the first 14 columns are read: one code point or a first..last range.
+    ends = line[0:14].rstrip(' \n').split('..')
+    if len(ends) == 1:
+        ends.append(ends[0])
+    ends = [int(end, 16) for end in ends]
+    for point in range(ends[0], ends[1] + 1):
+        # http://stackoverflow.com/questions/7105874/
+        # Going through bytes and bytearray works on Python 2 and Python 3 alike.
+        point = ('\\U%08x' % point).encode('ascii').decode('unicode-escape')
+        point = bytearray(point.encode('utf-8'))
+        tree = trees[len(point) - 1]
+        for unit in point:
+            tree = tree.setdefault(unit, dict())
+
+items = []
+
+def build(index, tree, units):
+    """Append to items the patterns for tree, the trie level reached through
+    the byte prefix units, whose sequences go on for index more bytes.
+
+    Returns True, appending nothing for this level, when all 64 bytes 0x80-0xbf
+    qualify here, so that the caller can fold the level into a trailing
+    [0x80-0xbf] class; returns False otherwise."""
+
+    if index == 0:
+        keys = sorted(tree)
+    else:
+        # Keep the bytes whose whole subtree folds; anything else below them
+        # has already been appended by the recursive call.
+        keys = [unit for unit, below in sorted(tree.items())
+            if build(index - 1, below, units + [unit])]
+
+    if len(keys) == 0:
+        return False
+    # Only a level with a prefix can fold: with no prefix the keys are lead
+    # bytes, and the result of a top-level call is ignored by the caller.
+    if units and len(keys) == 0xc0 - 0x80:
+        return True
+
+    item = ''.join('\\x%02x' % unit for unit in units) + '['
+
+    # keys is sorted, so one pass merges consecutive bytes into ranges; the
+    # trailing -1 is a sentinel that flushes the final range.
+    first = last = keys[0]
+    for unit in keys[1:] + [-1]:
+        if unit == last + 1:
+            last = unit
+            continue
+
+        item += '\\x%02x' % first
+        if first != last:
+            # A pair of neighbours is written without the dash.
+            if last != first + 1:
+                item += '-'
+            item += '\\x%02x' % last
+
+        first = last = unit
+
+    item += ']' + '[\\x80-\\xbf]' * index
+    items.append(item)
+    return False
+
+for index, tree in enumerate(trees):
+    build(index, tree, [])
+
+# Emit the alternatives in numbered chunks of roughly 1000 characters, then the
+# definition named argv[1], which refers to every chunk plus the remainder.
+name = sys.argv[1]
+parts = []
+part = []
+length = 0
+index = 0
+
+for item in items:
+    part += [item]
+    length += len(item) + 1
+    if length > 1000:
+        indexed = name + '_' + str(index)
+        index += 1
+        sys.stdout.write('%s %s\n' % (indexed, '|'.join(part)))
+        parts += ['{' + indexed + '}']
+        part = []
+        length = 0
+parts += part
+sys.stdout.write('%s %s\n' % (name, '|'.join(parts)))
diff --git a/unicode.sh b/unicode.sh
new file mode 100755
index 0000000..3f970a2
--- /dev/null
+++ b/unicode.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+
+# Cycript - Optimizing JavaScript Compiler/Runtime
+# 
Copyright (C) 2009-2015 Jay Freeman (saurik)
+
+# GNU Affero General Public License, Version 3 {{{
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+# }}}
+
+set -e -o pipefail
+
+name=$1
+shift 1
+# One grep per (property, data file) pair; pipefail makes a failure in this loop fail the whole script.
+while [[ $# != 0 ]]; do
+    prop=$1
+    data=$2
+    shift 2
+    grep -F "; ${prop} #" "${data}"
+done | ./unicode.py "${name}"
-- 
2.47.2