]> git.saurik.com Git - cycript.git/blob - unicode-l.py
Verify lexer tokens are always default reductions.
[cycript.git] / unicode-l.py
1 #!/usr/bin/python
2
3 # Cycript - Optimizing JavaScript Compiler/Runtime
4 # Copyright (C) 2009-2015 Jay Freeman (saurik)
5
6 # GNU Affero General Public License, Version 3 {{{
7 #
8 # This program is free software: you can redistribute it and/or modify
9 # it under the terms of the GNU Affero General Public License as published by
10 # the Free Software Foundation, either version 3 of the License, or
11 # (at your option) any later version.
12 #
13 # This program is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU Affero General Public License for more details.
17 #
18 # You should have received a copy of the GNU Affero General Public License
19 # along with this program. If not, see <http://www.gnu.org/licenses/>.
20 # }}}
21
22 import sys
23
# Output mode switch.  False: code points are inserted as UTF-8 byte
# sequences.  True: code points are inserted as their hexadecimal digit
# strings (in every upper/lower-case spelling) instead.
escape = False

# One trie per sequence length: trees[n - 1] holds every inserted sequence
# that is n units long.  Each trie is a nest of dicts keyed by unit ordinal.
trees = [dict() for _ in range(5)]
27
def insert(point):
    """Record one unit sequence in the trie for sequences of its length.

    point -- a sequence of units: a text string (hex digits), a byte string
             (UTF-8), or any iterable of one-character strings / ints.

    The sequence is stored in trees[len(point) - 1] as a chain of nested
    dicts keyed by the ordinal of each unit.  An empty sequence stores
    nothing.
    """
    point = list(point)
    if not point:
        return

    node = trees[len(point) - 1]
    for unit in point:
        # Iterating a byte string yields one-character strings on Python 2
        # but plain ints on Python 3; normalise both to the ordinal.
        if not isinstance(unit, int):
            unit = ord(unit)
        node = node.setdefault(unit, dict())
34
def insertmore(point, prefix=''):
    """Insert every upper/lower-case spelling of point, preceded by prefix.

    Each character of point is kept as written and, when upper-casing
    changes it, also tried upper-cased.  Every resulting combination is
    handed to insert() as prefix + spelling, with the as-written choice for
    a position always visited before its upper-cased alternative.
    """
    spellings = [prefix]
    for char in point:
        capital = char.upper()
        if capital == char:
            choices = (char,)
        else:
            choices = (char, capital)
        spellings = [head + choice for head in spellings for choice in choices]

    for spelling in spellings:
        insert(spelling)
46
# Read code point ranges from standard input, one per line, written as
# hexadecimal "XXXX" or "XXXX..YYYY", and insert every code point they cover.
for line in sys.stdin:
    # Only the first 14 columns are examined (presumably the widest possible
    # range field); trailing padding and the newline are stripped off.
    bounds = [int(end, 16) for end in line[0:14].rstrip(' \n').split('..')]
    if len(bounds) == 1:
        # A lone code point is a range that starts and ends on itself.
        bounds.append(bounds[0])

    for point in range(bounds[0], bounds[1] + 1):
        if escape:
            # Hex-digit mode: store the digits in every letter-case spelling.
            insertmore(format(point, 'x'))
        else:
            # http://stackoverflow.com/questions/7105874/
            # Build the character from a \UXXXXXXXX escape rather than
            # unichr()/chr().  The escape is turned into ASCII bytes first so
            # that the same decode call works on both Python 2 and Python 3.
            text = ("\\U%08x" % point).encode('ascii').decode('unicode-escape')
            insert(text.encode('utf-8'))
64
# Finished pattern alternatives, in the order build() appends them.
items = list()
66
def encode(value):
    """Render one unit ordinal as pattern text.

    In escape mode ASCII letters and digits are written literally; every
    other value, and every value when escape mode is off, is written as a
    two-digit \\xNN escape.
    """
    alphanumeric = (
        0x30 <= value <= 0x39 or    # '0'..'9'
        0x41 <= value <= 0x5a or    # 'A'..'Z'
        0x61 <= value <= 0x7a       # 'a'..'z'
    )
    if escape and alphanumeric:
        return chr(value)
    return '\\x%02x' % value
72
def build(index, tree, units, wrap=()):
    """Emit pattern alternatives for one trie node and report whether it is full.

    index -- how many further trie levels lie below the keys of this node
             (0 means the keys of tree are the final units of a sequence).
    tree  -- the trie node (dict keyed by unit ordinal) to process.
    units -- list of unit ordinals on the path from the root to this node.
    wrap  -- (prefix, suffix) pair placed around each emitted alternative.
             The default () means "no wrapping" outside escape mode; in
             escape mode it zero-pads the alternative to four digits and
             drops alternatives that would need more than four.

    Returns True when this node, below the root, contains every possible
    unit (all 64 bytes \\x80-\\xbf, or all 22 hex digit characters in escape
    mode) with complete subtrees.  In that case nothing is emitted and the
    caller represents the whole subtree with a wildcard class.  Otherwise
    an alternative describing this node is appended to items (unless there
    is nothing to describe) and False is returned.
    """
    if index == 0:
        keys = sorted(tree)
    else:
        # Children that are only partly covered emit their own alternatives
        # during the recursive call; only complete children are kept here so
        # they can share a single wildcard suffix.
        keys = [unit for unit in sorted(tree)
                if build(index - 1, tree[unit], units + [unit], wrap)]

    if not keys:
        return False

    # 10 digits + 6 lower-case + 6 upper-case hex letters, or the 64 bytes
    # in \x80-\xbf.
    full = 10 + 6 + 6 if escape else 0xc0 - 0x80
    # The root has no parent that could absorb a "full" answer, so it must
    # always emit its own alternative; returning early there would silently
    # lose every key at this level.
    if units and len(keys) == full:
        return True

    pieces = [encode(unit) for unit in units]
    pieces.append('[')

    # Collapse runs of consecutive keys into ranges.  The trailing None
    # forces the final run to be flushed.
    first = last = keys[0]
    for unit in keys[1:] + [None]:
        if unit is not None and unit == last + 1:
            last = unit
            continue

        pieces.append(encode(first))
        if first != last:
            # A run of exactly two is written as both members, without '-'.
            if last != first + 1:
                pieces.append('-')
            pieces.append(encode(last))

        first = last = unit

    pieces.append(']')

    if index != 0:
        # Every remaining level is complete, so one wildcard class (with a
        # repeat count when there is more than one level) covers them all.
        pieces.append('[0-9A-Fa-f]' if escape else '[\\x80-\\xbf]')
        if index != 1:
            pieces.append('{' + str(index) + '}')

    item = ''.join(pieces)

    # Total number of units in the sequences this alternative matches.
    count = len(units) + 1 + index
    if wrap == ():
        if not escape:
            wrap = ('', '')
        elif count > 4:
            return False
        else:
            wrap = ('0' * (4 - count), '')

    items.append(wrap[0] + item + wrap[1])
    return False
148
# Turn every trie into pattern alternatives.  trees[index] holds sequences of
# index + 1 units, so index is also the number of levels below the root keys.
for index, tree in enumerate(trees):
    build(index, tree, [])
    if not escape:
        continue
    # Escape mode gets a second pass whose alternatives are wrapped in
    # literal braces and allow any number of leading zeros.
    build(index, tree, [], ('\\{0*', '\\}'))
153
# Print the alternatives as named definitions.  Alternatives are gathered
# into chunks; whenever a chunk grows past 1000 characters it is printed as
# "<name>_<n> alt|alt|..." and replaced by a "{<name>_<n>}" reference.  The
# final line "<name> ..." joins those references with any leftover
# alternatives.  (The output looks like lexer-generator named definitions --
# TODO confirm the consumer and the reason for the 1000 character limit.)
name = sys.argv[1]
parts = []
part = []
length = 0
index = 0
for item in items:
    part.append(item)
    # +1 accounts for the '|' separator that joins this alternative.
    length += len(item) + 1
    if length > 1000:
        indexed = name + '_' + str(index)
        index += 1
        # sys.stdout.write produces the same text as the Python 2 print
        # statement while remaining valid syntax on Python 3.
        sys.stdout.write(indexed + ' ' + '|'.join(part) + '\n')
        parts.append('{' + indexed + '}')
        part = []
        length = 0
parts += part
sys.stdout.write(name + ' ' + '|'.join(parts) + '\n')