unicode-l.py

   1 #!/usr/bin/python
   2
   3 # Cycript - Optimizing JavaScript Compiler/Runtime
   4 # Copyright (C) 2009-2015  Jay Freeman (saurik)
   5
   6 # GNU Affero General Public License, Version 3 {{{
   7 #
   8 # This program is free software: you can redistribute it and/or modify
   9 # it under the terms of the GNU Affero General Public License as published by
  10 # the Free Software Foundation, either version 3 of the License, or
  11 # (at your option) any later version.
  12 #
  13 # This program is distributed in the hope that it will be useful,
  14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 # GNU Affero General Public License for more details.
  17 #
  18 # You should have received a copy of the GNU Affero General Public License
  19 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
  20 # }}}
  21
  22 import sys
  23
  24 escape = False
  25
  26 trees = [dict(), dict(), dict(), dict(), dict()]
  27
  28 def insert(point):
  29     point = list(point)
  30     tree = trees[len(point) - 1]
  31     for unit in point:
  32         unit = ord(unit)
  33         tree = tree.setdefault(unit, dict())
  34
  35 def insertmore(point, prefix=''):
  36     if len(point) == 0:
  37         return insert(prefix)
  38
  39     next = point[0]
  40     point = point[1:]
  41     insertmore(point, prefix + next)
  42
  43     upper = next.upper()
  44     if upper != next:
  45         insertmore(point, prefix + upper)
  46
  47 for line in sys.stdin:
  48     line = line[0:14]
  49     line = line.rstrip(' \n')
  50     line = line.split('..')
  51     if len(line) == 1:
  52         line.append(line[0])
  53     line = [int(end, 16) for end in line]
  54     for point in range(line[0], line[1] + 1):
  55         if escape:
  56             point = format(point, 'x')
  57             insertmore(point)
  58         else:
  59             # http://stackoverflow.com/questions/7105874/
  60             point = "\\U%08x" % point
  61             point = point.decode('unicode-escape')
  62             point = point.encode('utf-8')
  63             insert(point)
  64
# Pattern fragments produced by build(), in emission order; they are joined
# with '|' when the definitions are printed at the end of the script.
items = []
  66
  67 def encode(value):
  68     if escape:
  69         if ord('A') <= value <= ord('Z') or ord('a') <= value <= ord('z') or ord('0') <= value <= ord('9'):
  70             return chr(value)
  71     return '\\x%02x' % value
  72
  73 def build(index, tree, units, wrap=()):
  74     if index == 0:
  75         keys = sorted(tree.keys())
  76     else:
  77         keys = []
  78         for unit, tree in sorted(tree.items()):
  79             if build(index - 1, tree, units + [unit], wrap):
  80                 keys.append(unit)
  81
  82     if len(keys) == 0:
  83         return False
  84
  85     if escape:
  86         if len(keys) == 10 + 6 + 6:
  87             return True
  88     else:
  89         if len(keys) == 0xc0 - 0x80:
  90             return True
  91
  92     item = ''
  93     for unit in units:
  94         item += encode(unit)
  95     item += '['
  96
  97     first = -1
  98     last = -1
  99
 100     assert len(keys) != 0
 101     for unit in keys + [-1]:
 102         if unit != -1:
 103             if first == -1:
 104                 first = unit
 105                 last = unit
 106                 continue
 107             if unit == last + 1:
 108                 last = unit
 109                 continue
 110
 111         item += encode(first)
 112         if first != last:
 113             if last != first + 1:
 114                 item += '-'
 115             item += encode(last)
 116
 117         first = unit
 118         last = unit
 119
 120     item += ']'
 121
 122     if index != 0:
 123         if escape:
 124             item += '[0-9A-Fa-f]'
 125         else:
 126             item += '[\\x80-\\xbf]'
 127         if index != 1:
 128             item += '{' + str(index) + '}'
 129
 130     if False:
 131         item = item.replace('[\\x00-\\x7f]', '{U1}')
 132         item = item.replace('[\\x80-\\xbf]', '{U0}')
 133         item = item.replace('[\\xc2-\\xdf]', '{U2}')
 134         item = item.replace('[\\xe0-\\xef]', '{U3}')
 135         item = item.replace('[\\xf0-\\xf4]', '{U4}')
 136
 137     count = len(units) + 1 + index
 138     if wrap == ():
 139         if not escape:
 140             wrap = ('', '')
 141         elif count > 4:
 142             return False
 143         else:
 144             wrap = ('0' * (4 - count), '')
 145
 146     items.append(wrap[0] + item + wrap[1])
 147     return False
 148
 149 for index, tree in enumerate(trees):
 150     build(index, tree, [])
 151     if escape:
 152         build(index, tree, [], ('\\{0*', '\\}'))
 153
 154 name = sys.argv[1]
 155 parts = []
 156 part = []
 157 length = 0
 158 index = 0
 159 for item in items:
 160     part += [item]
 161     length += len(item) + 1
 162     if length > 1000:
 163         indexed = name + '_' + str(index)
 164         index += 1
 165         print indexed, '|'.join(part)
 166         parts += ['{' + indexed + '}']
 167         part = []
 168         length = 0
 169 parts += part
 170 print name, '|'.join(parts)