]>
git.saurik.com Git - apple/icu.git/blob - icuSources/tools/escapesrc/tblgen.cpp
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 #include "unicode/utypes.h"
5 #include "unicode/ucnv.h"
6 #include "unicode/uniset.h"
// Name of the converter that main() opens with ucnv_open() to build the generated tables.
9 static const char * kConverter
= "ibm-1047" ;
11 int main ( int argc
, const char * argv
[]) {
12 printf ( "// %s \n " , U_COPYRIGHT_STRING
);
13 printf ( "// generated by tblgen. You weren't going to edit it by hand, were you? \n " );
16 UErrorCode status
= U_ZERO_ERROR
;
17 LocalUConverterPointer
cnv ( ucnv_open ( kConverter
, & status
));
19 if ( U_FAILURE ( status
)) {
20 fprintf ( stderr
, "Failed to open %s : %s \n " , kConverter
, u_errorName ( status
));
24 printf ( "static const char cp1047_8859_1[256] = { \n " );
25 for ( int i
= 0x00 ; i
< 0x100 ; i
++) {
30 const char * source
= cp1047
;
31 ucnv_toUnicode ( cnv
. getAlias (), & target
, u
+ 1 , & source
, cp1047
+ 1 , nullptr , true , & status
);
32 if ( U_FAILURE ( status
)) {
33 fprintf ( stderr
, "Conversion failure at # %X : %s \n " , i
, u_errorName ( status
));
36 printf ( " (char)0x %0 2X, /* %0 2X */ \n " , u
[ 0 ], i
);
41 // UnicodeSet oldIllegal("[:print:]", status); // [a-zA-Z0-9_}{#)(><%:;.?*+-/^&|~!=,\\u005b\\u005d\\u005c]", status);
42 UnicodeSet
oldIllegal ( "[0-9 a-z A-Z "
43 "_ \\ { \\ } \\ [ \\ ] # \\ ( \\ ) < > % \\ : ; . "
44 "? * + \\ - / \\ ^ \\ & | ~ ! = , \\ \" ' ]" , status
);
48 http://www.lirmm.fr/~ducour/Doc-objets/ISO+IEC+14882-1998.pdf ( note: 1998 ) page 10, section 2.2 says:
50 1 The basic source character set consists of 96 characters: the space character, the control characters representing
51 horizontal tab, vertical tab, form feed, and new-line, plus the following 91 graphical characters: 15)
52 a b c d e f g h i j k l m n o p q r s t u v w x y z
53 A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
55 _ { } [ ] # ( ) < > % : ; . ?*+-/^&|~!=,\"
56 2 The universal-character-name construct provides a way to name other characters. hex-quad:
57 hexadecimal-digit hexadecimal-digit hexadecimal-digit hexadecimal-digit
58 universal-character-name: \u hex-quad
60 The character designated by the universal-character-name \UNNNNNNNN is that character whose character short name in ISO/IEC 10646 is NNNNNNNN; the character designated by the universal-character-name \uNNNN is that character whose character short name in ISO/IEC 10646 is 0000NNNN. If the hexadecimal value for a universal character name is less than 0x20 or in the range 0x7F-0x9F (inclusive), or if the universal character name designates a character in the basic source character set, then the program is ill-formed.
63 So basically: printable ASCII plus 0x00-0x1F and 0x7F-0x9F were all illegal.
65 Some discussion at http://unicode.org/mail-arch/unicode-ml/y2003-m10/0471.html
71 printf ( "static const bool oldIllegal[256] = { \n " );
72 for ( UChar i
= 0x00 ; i
< 0x100 ; i
++) {
73 printf ( " %s , /* U+ %0 4X */ \n " ,
74 ( oldIllegal
. contains ( i
))? " true" : "false" ,