]>
Commit | Line | Data |
---|---|---|
1 | /* | |
2 | * Copyright (C) 2012 Apple Inc. All rights reserved. | |
3 | * | |
4 | * Redistribution and use in source and binary forms, with or without | |
5 | * modification, are permitted provided that the following conditions | |
6 | * are met: | |
7 | * 1. Redistributions of source code must retain the above copyright | |
8 | * notice, this list of conditions and the following disclaimer. | |
9 | * 2. Redistributions in binary form must reproduce the above copyright | |
10 | * notice, this list of conditions and the following disclaimer in the | |
11 | * documentation and/or other materials provided with the distribution. | |
12 | * | |
13 | * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY | |
14 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
15 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | |
16 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR | |
17 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | |
18 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | |
19 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | |
20 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY | |
21 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
22 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
23 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
24 | */ | |
25 | ||
// Canonicalize a UTF-16 code unit for case-insensitive matching, following
// the Canonicalize abstract operation of ES 5.1, 15.10.2.8.
//
// ch: numeric code unit to canonicalize.
// Returns the code unit of the upper-cased character, or ch itself when the
// upper-case form is longer than one code unit, or when upper-casing would
// map a non-ASCII code unit (>= 128) onto an ASCII one (< 128).
function canonicalize(ch)
{
    var upper = String.fromCharCode(ch).toUpperCase();

    if (upper.length <= 1) {
        var upperCode = upper.charCodeAt(0);
        // A non-ASCII character is never allowed to canonicalize to ASCII.
        var mapsIntoAscii = ch >= 128 && upperCode < 128;
        if (!mapsIntoAscii)
            return upperCode;
    }

    // Multi-unit upper-case forms and non-ASCII -> ASCII mappings are left as is.
    return ch;
}
37 | ||
// Largest code unit considered at all, and largest code unit that fits in
// the 8-bit (latin) tables built further down.
var MAX_UCS2 = 0xFFFF;
var MAX_LATIN = 0xFF;

// Pass 1: build groupedCanonically, a sparse array indexed by canonical value
// whose entries list every code unit that canonicalizes to that value.
var groupedCanonically = [];
for (var i = 0; i <= MAX_UCS2; ++i) {
    var ch = canonicalize(i);
    // Create the bucket on first use, then record this code unit in it.
    (groupedCanonically[ch] || (groupedCanonically[ch] = [])).push(i);
}
50 | ||
// Per-code-unit classification tables. Every entry is a string of the form
// "<TypeName>:<payload>"; the output stage splits it on ':' again.
var typeInfo = [];
var latinTypeInfo = [];
// Each entry is the sorted list of code units forming one canonical group of
// three or more members; "CanonicalizeSet:<n>" payloads index into this array.
var characterSetInfo = [];

// Pass 2: populate typeInfo, latinTypeInfo & characterSetInfo. Every group of
// characters sharing a canonical value is classified by its size:
//   1 member   -> CanonicalizeUnique
//   2 members  -> CanonicalizeAlternating{Aligned,Unaligned} when adjacent,
//                 otherwise CanonicalizeRangeLo / CanonicalizeRangeHi with the
//                 distance between the two as payload
//   3+ members -> CanonicalizeSet with an index into characterSetInfo
// cu is declared here so the loop does not create an implicit global (which
// would be a ReferenceError in strict mode).
for (var cu in groupedCanonically) {
    // The set of characters that canonicalize to cu.
    var characters = groupedCanonically[cu];

    // If there is only one, it is unique.
    if (characters.length == 1) {
        typeInfo[characters[0]] = "CanonicalizeUnique:0";
        latinTypeInfo[characters[0]] = characters[0] <= MAX_LATIN ? "CanonicalizeLatinSelf:0" : "CanonicalizeLatinInvalid:0";
        continue;
    }

    // Sort numerically so characters[0] is the smallest member.
    characters.sort(function(x,y){return x-y;});

    // If there are more than two characters, create an entry in characterSetInfo.
    if (characters.length > 2) {
        // characterSetInfo.length is the index this set gets once pushed below.
        for (var i = 0; i < characters.length; ++i)
            typeInfo[characters[i]] = "CanonicalizeSet:" + characterSetInfo.length;
        characterSetInfo.push(characters);

        // The latin table can only name a single latin partner per set.
        if (characters[1] <= MAX_LATIN)
            throw new Error("sets with more than one latin character not supported!");
        if (characters[0] <= MAX_LATIN) {
            // Every member points at the one latin member; that member is then
            // overwritten to map to itself.
            for (var i = 0; i < characters.length; ++i)
                latinTypeInfo[characters[i]] = "CanonicalizeLatinOther:" + characters[0];
            latinTypeInfo[characters[0]] = "CanonicalizeLatinSelf:0";
        } else {
            for (var i = 0; i < characters.length; ++i)
                latinTypeInfo[characters[i]] = "CanonicalizeLatinInvalid:0";
        }

        continue;
    }

    // We have a pair, mark alternating ranges, otherwise track whether this is the low or high partner.
    var lo = characters[0];
    var hi = characters[1];
    var delta = hi - lo;
    if (delta == 1) {
        // Adjacent pair: "Aligned" when the low member is even, "Unaligned" when odd.
        var type = lo & 1 ? "CanonicalizeAlternatingUnaligned:0" : "CanonicalizeAlternatingAligned:0";
        typeInfo[lo] = type;
        typeInfo[hi] = type;
    } else {
        typeInfo[lo] = "CanonicalizeRangeLo:" + delta;
        typeInfo[hi] = "CanonicalizeRangeHi:" + delta;
    }

    if (lo > MAX_LATIN) {
        // Neither member is latin.
        latinTypeInfo[lo] = "CanonicalizeLatinInvalid:0";
        latinTypeInfo[hi] = "CanonicalizeLatinInvalid:0";
    } else if (hi > MAX_LATIN) {
        // Only the low member is latin; the high member records it as payload.
        latinTypeInfo[lo] = "CanonicalizeLatinSelf:0";
        latinTypeInfo[hi] = "CanonicalizeLatinOther:" + lo;
    } else {
        // Both members are latin: they must differ exactly by the 0x20 bit,
        // with that bit clear in the low member.
        if (delta != 0x20 || lo & 0x20)
            throw new Error("pairs of latin characters that don't mask with 0x20 not supported!");
        latinTypeInfo[lo] = "CanonicalizeLatinMask0x20:0";
        latinTypeInfo[hi] = "CanonicalizeLatinMask0x20:0";
    }
}
116 | ||
// Collapse a per-code-unit table of type strings into maximal runs.
//
// info: array indexed by code unit (0..MAX_UCS2) holding a type string.
// Returns an array of {begin, end, type} records, in ascending order, where
// begin..end (inclusive) is a run of consecutive code units whose entries
// compare equal.
function coalesceRanges(info)
{
    var ranges = [];
    for (var end = 0; end <= MAX_UCS2; ++end) {
        var begin = end;
        var type = info[end];
        // Extend the run while the next entry has the same type; the bound
        // check keeps the lookahead inside the table.
        while (end < MAX_UCS2 && info[end + 1] == type)
            ++end;
        ranges.push({begin:begin, end:end, type:type});
    }
    return ranges;
}

// Pass 3: coalesce types into ranges.
var rangeInfo = coalesceRanges(typeInfo);

// Pass 4: coalesce latin-1 types into ranges.
var latinRangeInfo = coalesceRanges(latinTypeInfo);
136 | ||
137 | ||
// Format a value as a C unsigned hexadecimal literal, zero-padded to at least
// four hex digits (the width of a uint16_t), e.g. 255 -> "0x00ffu".
//
// x: a number, or a numeric string (it is converted with Number()).
// Returns the literal as a string; values wider than four digits are not truncated.
function hex(x)
{
    var digits = Number(x).toString(16);
    var padding = "";
    for (var missing = 4 - digits.length; missing > 0; --missing)
        padding += "0";
    return "0x" + padding + digits + "u";
}
146 | ||
// Licence header written at the top of the generated file: one array entry
// per output line, joined with newlines (no trailing newline).
var copyright = [
    "/*",
    " * Copyright (C) 2012 Apple Inc. All rights reserved.",
    " *",
    " * Redistribution and use in source and binary forms, with or without",
    " * modification, are permitted provided that the following conditions",
    " * are met:",
    " * 1. Redistributions of source code must retain the above copyright",
    " * notice, this list of conditions and the following disclaimer.",
    " * 2. Redistributions in binary form must reproduce the above copyright",
    " * notice, this list of conditions and the following disclaimer in the",
    " * documentation and/or other materials provided with the distribution.",
    " *",
    " * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY",
    " * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE",
    " * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR",
    " * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR",
    " * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,",
    " * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,",
    " * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR",
    " * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY",
    " * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT",
    " * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE",
    " * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ",
    " */"
].join("\n");
172 | ||
// Output stage: write the generated C++ source, line by line, through print().
// NOTE(review): print is not defined in this file; presumably it is the host
// shell's global that writes one line to standard output - confirm.

// Print one initializer row per range record: begin, end, payload, type name.
// ranges: array of {begin, end, type} records whose type is "<TypeName>:<payload>".
function printRangeRows(ranges)
{
    for (var i = 0; i < ranges.length; ++i) {
        var info = ranges[i];
        var typeAndValue = info.type.split(':');
        print(" { " + hex(info.begin) + ", " + hex(info.end) + ", " + hex(typeAndValue[1]) + ", " + typeAndValue[0] + " },");
    }
}

print(copyright);
print();
print("// DO NOT EDIT! - this file autogenerated by YarrCanonicalizeUCS2.js");
print();
print('#include "config.h"');
print('#include "YarrCanonicalizeUCS2.h"');
print();
print("namespace JSC { namespace Yarr {");
print();
// NOTE(review): this #include is emitted after the namespace has been opened;
// kept as is so the generated output does not change - confirm it is intended.
print("#include <stdint.h>");
print();

// One zero-terminated uint16_t array per canonical set of three or more characters.
for (var i = 0; i < characterSetInfo.length; ++i) {
    var characters = "";
    var set = characterSetInfo[i];
    for (var j = 0; j < set.length; ++j)
        characters += hex(set[j]) + ", ";
    print("uint16_t ucs2CharacterSet" + i + "[] = { " + characters + "0 };");
}
print();
// Table of pointers to the arrays above, in the same order.
print("static const size_t UCS2_CANONICALIZATION_SETS = " + characterSetInfo.length + ";");
print("uint16_t* characterSetInfo[UCS2_CANONICALIZATION_SETS] = {");
for (var i = 0; i < characterSetInfo.length; ++i)
    print(" ucs2CharacterSet" + i + ",");
print("};");
print();
print("const size_t UCS2_CANONICALIZATION_RANGES = " + rangeInfo.length + ";");
print("UCS2CanonicalizationRange rangeInfo[UCS2_CANONICALIZATION_RANGES] = {");
printRangeRows(rangeInfo);
print("};");
print();
print("const size_t LATIN_CANONICALIZATION_RANGES = " + latinRangeInfo.length + ";");
print("LatinCanonicalizationRange latinRangeInfo[LATIN_CANONICALIZATION_RANGES] = {");
printRangeRows(latinRangeInfo);
print("};");
print();
print("} } // JSC::Yarr");
print();
219 |