git.saurik.com Git - cycript.git/blob - unicode.py
Move LexSetRegEx to new "post-lookahead" paradigm.
[cycript.git] / unicode.py
1 #!/usr/bin/python
2
3 # Cycript - Optimizing JavaScript Compiler/Runtime
4 # Copyright (C) 2009-2015 Jay Freeman (saurik)
5
6 # GNU Affero General Public License, Version 3 {{{
7 #
8 # This program is free software: you can redistribute it and/or modify
9 # it under the terms of the GNU Affero General Public License as published by
10 # the Free Software Foundation, either version 3 of the License, or
11 # (at your option) any later version.
12 #
13 # This program is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU Affero General Public License for more details.
17 #
18 # You should have received a copy of the GNU Affero General Public License
19 # along with this program. If not, see <http://www.gnu.org/licenses/>.
20 # }}}
21
22 import sys
23
# One byte-keyed prefix tree per UTF-8 encoded length: trees[n - 1] collects
# every code point whose UTF-8 form is n bytes long, as nested dicts keyed by
# successive byte values.
trees = [dict(), dict(), dict(), dict()]

for record in sys.stdin:
    # Only the first 14 characters of each input line are examined; they hold
    # either a single hexadecimal code point or a "first..last" range
    # (presumably 14 is the width of the widest "XXXXXX..XXXXXX" range --
    # TODO confirm against the input file format).
    bounds = record[0:14].rstrip(' \n').split('..')
    if len(bounds) == 1:
        # A lone code point is treated as a range of length one.
        bounds.append(bounds[0])
    bounds = [int(bound, 16) for bound in bounds]

    for codepoint in range(bounds[0], bounds[1] + 1):
        # Go through a \UXXXXXXXX escape rather than building the character
        # directly; see http://stackoverflow.com/questions/7105874/
        escaped = "\\U%08x" % codepoint
        encoded = escaped.decode('unicode-escape').encode('utf-8')

        # Walk (and extend as needed) the tree for this encoded length, one
        # byte value per level.
        node = trees[len(encoded) - 1]
        for char in encoded:
            node = node.setdefault(ord(char), dict())
43
# Regular-expression alternatives emitted by build(), in emission order.
items = []


def build(index, tree, units):
    r"""Emit regex alternatives for one level of a UTF-8 byte prefix tree.

    Arguments:
      index -- how many further bytes follow a byte of this level; one
               ``[\x80-\xbf]`` class is appended to the pattern for each.
      tree  -- dict mapping a byte value at this level to the dict of byte
               values that may follow it.
      units -- list of the byte values already consumed above this level;
               empty for a top-level call.

    A level that needs its own pattern appends one string to the module-level
    ``items`` list: ``\xNN`` escapes for ``units``, then a bracketed class of
    this level's bytes (consecutive values are collapsed into ``a-b`` ranges),
    then ``index`` copies of ``[\x80-\xbf]``.

    Returns True, without emitting anything, when this is a nested level
    (``units`` is non-empty) whose class would contain 64 bytes, i.e. the
    whole continuation range 0x80-0xbf; the caller then folds the prefix byte
    into its own class.  Returns False in every other case, including an
    empty level.
    """
    # Visit bytes in ascending order: dict iteration order is not guaranteed
    # to be sorted, and the range compression below only merges neighbours
    # that arrive consecutively.
    if index == 0:
        keys = sorted(tree)
    else:
        # Children that are not "complete" emit their own patterns while
        # being visited; only the complete ones stay in this level's class.
        keys = []
        for unit in sorted(tree):
            if build(index - 1, tree[unit], units + [unit]):
                keys.append(unit)

    if not keys:
        return False
    # Only a nested level can be folded into its parent.  The top-level
    # call's return value is discarded, so returning True there would
    # silently drop a 64-entry set of lead or single-byte values.
    if units and len(keys) == 0xc0 - 0x80:
        return True

    pieces = ['\\x%02x' % unit for unit in units]
    pieces.append('[')

    # Collapse runs of consecutive byte values; the trailing None flushes
    # the final run.
    first = last = keys[0]
    for unit in keys[1:] + [None]:
        if unit is not None and unit == last + 1:
            last = unit
            continue

        pieces.append('\\x%02x' % first)
        if first != last:
            # A two-element run is written as the two bytes side by side;
            # only longer runs use a dash.
            if last != first + 1:
                pieces.append('-')
            pieces.append('\\x%02x' % last)

        first = last = unit

    pieces.append(']')
    pieces.extend(['[\\x80-\\xbf]'] * index)

    items.append(''.join(pieces))
    return False
102
# Generate the alternatives for every encoded length; trees[depth] holds the
# code points with depth + 1 bytes, so depth bytes follow each lead byte.
for depth, root in enumerate(trees):
    build(depth, root, [])

# The definition name is the script's first command-line argument.
name = sys.argv[1]

# Alternatives are flushed into numbered sub-definitions ("<name>_<n>") once
# a chunk grows past 1000 characters (each item counted with one extra
# character for its separator); the final "<name>" definition then joins
# references to those sub-definitions with whatever items are left over.
references = []
pending = []
pending_size = 0
serial = 0
for item in items:
    pending.append(item)
    pending_size += len(item) + 1
    if pending_size <= 1000:
        continue

    label = '%s_%d' % (name, serial)
    serial += 1
    sys.stdout.write('%s %s\n' % (label, '|'.join(pending)))
    references.append('{%s}' % label)
    pending = []
    pending_size = 0

sys.stdout.write('%s %s\n' % (name, '|'.join(references + pending)))
120 length = 0
121 parts += part
122 print name, '|'.join(parts)