]> git.saurik.com Git - apple/javascriptcore.git/blob - yarr/YarrCanonicalizeUCS2.js
JavaScriptCore-1097.3.3.tar.gz
[apple/javascriptcore.git] / yarr / YarrCanonicalizeUCS2.js
1 /*
2 * Copyright (C) 2012 Apple Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 */
25
26 // See ES 5.1, 15.10.2.8
// Canonicalize a single UTF-16 code unit for case-insensitive matching.
//   ch: numeric code unit.
// Returns the code unit of the upper-cased character, or ch itself when
// upper-casing yields more than one code unit, or when it would map a
// non-ASCII input (>= 128) onto an ASCII result (< 128).
function canonicalize(ch)
{
    var upper = String.fromCharCode(ch).toUpperCase();

    // Only a single-code-unit result is eligible to replace ch.
    if (upper.length <= 1) {
        var upperCode = upper.charCodeAt(0);
        var crossesIntoAscii = ch >= 128 && upperCode < 128;
        if (!crossesIntoAscii)
            return upperCode;
    }

    // Multi-unit expansion, or a non-ASCII -> ASCII mapping: leave unchanged.
    return ch;
}
37
// Largest code unit considered overall, and largest one treated as Latin-1.
var MAX_UCS2 = 0xFFFF;
var MAX_LATIN = 0xFF;

// Pass 1: populate groupedCanonically - a sparse array indexed by canonical
// value, where each entry lists every code unit that canonicalizes to that
// index. Code units are visited in increasing order, so each list is built
// in ascending order.
var groupedCanonically = [];
for (var i = 0; i <= MAX_UCS2; ++i) {
    var ch = canonicalize(i);
    // Create the bucket on first use, then append the current code unit.
    (groupedCanonically[ch] || (groupedCanonically[ch] = [])).push(i);
}
50
// Per-character classification tables, indexed by code unit. Each entry is a
// string of the form "TypeName:payload"; the output stage splits it on ':'.
var typeInfo = [];
var latinTypeInfo = [];
// Lists of characters for every canonical group with more than two members.
var characterSetInfo = [];

// Pass 2: populate typeInfo, latinTypeInfo & characterSetInfo. For every
// character calculate a type name and a value payload.
// groupedCanonically is sparse, so walk it by index and skip the holes; the
// loop variable is declared here rather than leaking as an implicit global.
for (var cu = 0; cu < groupedCanonically.length; ++cu) {
    // The set of characters that canonicalize to cu.
    var characters = groupedCanonically[cu];
    if (!characters)
        continue;

    // If there is only one, it is unique.
    if (characters.length === 1) {
        typeInfo[characters[0]] = "CanonicalizeUnique:0";
        latinTypeInfo[characters[0]] = characters[0] <= MAX_LATIN ? "CanonicalizeLatinSelf:0" : "CanonicalizeLatinInvalid:0";
        continue;
    }

    // Sort numerically so characters[0] is the smallest member.
    characters.sort(function(x, y) { return x - y; });

    // If there are more than two characters, create an entry in characterSetInfo.
    if (characters.length > 2) {
        // The payload is the index this set is about to occupy in characterSetInfo.
        var setType = "CanonicalizeSet:" + characterSetInfo.length;
        for (var i = 0; i < characters.length; ++i)
            typeInfo[characters[i]] = setType;
        characterSetInfo.push(characters);

        // After sorting, a second member <= MAX_LATIN means two latin characters share a set.
        if (characters[1] <= MAX_LATIN)
            throw new Error("sets with more than one latin character not supported!");

        if (characters[0] <= MAX_LATIN) {
            // Every non-latin member points at the single latin member ...
            for (i = 0; i < characters.length; ++i)
                latinTypeInfo[characters[i]] = "CanonicalizeLatinOther:" + characters[0];
            // ... and the latin member itself maps to itself.
            latinTypeInfo[characters[0]] = "CanonicalizeLatinSelf:0";
        } else {
            for (i = 0; i < characters.length; ++i)
                latinTypeInfo[characters[i]] = "CanonicalizeLatinInvalid:0";
        }

        continue;
    }

    // We have a pair: mark adjacent pairs as alternating (aligned when the low
    // member is even, unaligned when odd), otherwise record whether each
    // character is the low or the high partner together with their distance.
    var lo = characters[0];
    var hi = characters[1];
    var delta = hi - lo;
    if (delta === 1) {
        var type = (lo & 1) ? "CanonicalizeAlternatingUnaligned:0" : "CanonicalizeAlternatingAligned:0";
        typeInfo[lo] = type;
        typeInfo[hi] = type;
    } else {
        typeInfo[lo] = "CanonicalizeRangeLo:" + delta;
        typeInfo[hi] = "CanonicalizeRangeHi:" + delta;
    }

    if (lo > MAX_LATIN) {
        // Neither member is latin.
        latinTypeInfo[lo] = "CanonicalizeLatinInvalid:0";
        latinTypeInfo[hi] = "CanonicalizeLatinInvalid:0";
    } else if (hi > MAX_LATIN) {
        // Only the low member is latin; the high member refers back to it.
        latinTypeInfo[lo] = "CanonicalizeLatinSelf:0";
        latinTypeInfo[hi] = "CanonicalizeLatinOther:" + lo;
    } else {
        // Both are latin: only pairs that differ by exactly the 0x20 bit,
        // with that bit clear in the low member, are accepted.
        if (delta !== 0x20 || (lo & 0x20) !== 0)
            throw new Error("pairs of latin characters that don't mask with 0x20 not supported!");
        latinTypeInfo[lo] = "CanonicalizeLatinMask0x20:0";
        latinTypeInfo[hi] = "CanonicalizeLatinMask0x20:0";
    }
}
116
// Collapse a per-character table into maximal runs of identical entries.
//   types: array indexed by code unit (0..MAX_UCS2).
// Returns an array of {begin, end, type} records with inclusive bounds,
// covering 0..MAX_UCS2 in ascending order.
function coalesceTypeRanges(types)
{
    var ranges = [];
    for (var end = 0; end <= MAX_UCS2; ++end) {
        var begin = end;
        var type = types[end];
        // Extend the run while the next character has the same entry.
        while (end < MAX_UCS2 && types[end + 1] === type)
            ++end;
        ranges.push({begin: begin, end: end, type: type});
    }
    return ranges;
}

// Pass 3: coalesce types into ranges.
var rangeInfo = coalesceTypeRanges(typeInfo);

// Pass 4: coalesce latin-1 types into ranges.
var latinRangeInfo = coalesceTypeRanges(latinTypeInfo);
136
137
138 // Helper function to convert a number to a fixed width hex representation of a C uint16_t.
// Format a value as a C unsigned hex literal, e.g. 255 -> "0x00ffu".
//   x: a number, or a string that Number() can convert.
// Returns "0x" + lowercase hex digits left-padded with zeros to at least
// four characters + "u". Longer digit strings are not truncated.
function hex(x)
{
    var digits = Number(x).toString(16);

    // Work out how many leading zeros are needed to reach four characters.
    var padding = "";
    for (var missing = 4 - digits.length; missing > 0; --missing)
        padding += "0";

    return "0x" + padding + digits + "u";
}
146
// License header emitted at the top of the generated file, one array element
// per output line, joined with newlines (no trailing newline).
var copyright = [
    "/*",
    " * Copyright (C) 2012 Apple Inc. All rights reserved.",
    " *",
    " * Redistribution and use in source and binary forms, with or without",
    " * modification, are permitted provided that the following conditions",
    " * are met:",
    " * 1. Redistributions of source code must retain the above copyright",
    " * notice, this list of conditions and the following disclaimer.",
    " * 2. Redistributions in binary form must reproduce the above copyright",
    " * notice, this list of conditions and the following disclaimer in the",
    " * documentation and/or other materials provided with the distribution.",
    " *",
    " * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY",
    " * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE",
    " * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR",
    " * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR",
    " * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,",
    " * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,",
    " * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR",
    " * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY",
    " * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT",
    " * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE",
    " * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ",
    " */"
].join("\n");
172
// Output stage: print the generated C++ source, one print() call per line.

// File preamble: license, generated-file warning, includes, namespace open.
print(copyright);
print();
print("// DO NOT EDIT! - this file autogenerated by YarrCanonicalizeUCS2.js");
print();
print('#include "config.h"');
print('#include "YarrCanonicalizeUCS2.h"');
print();
print("namespace JSC { namespace Yarr {");
print();
print("#include <stdint.h>");
print();

// One zero-terminated uint16_t array per multi-character canonical set.
for (var setIndex = 0; setIndex < characterSetInfo.length; ++setIndex) {
    var members = characterSetInfo[setIndex];
    var initializers = [];
    for (var memberIndex = 0; memberIndex < members.length; ++memberIndex)
        initializers.push(hex(members[memberIndex]));
    // Trailing 0 terminates the list.
    initializers.push("0");
    print("uint16_t ucs2CharacterSet" + setIndex + "[] = { " + initializers.join(", ") + " };");
}
print();

// Table of pointers to the per-set arrays printed above.
print("static const size_t UCS2_CANONICALIZATION_SETS = " + characterSetInfo.length + ";");
print("uint16_t* characterSetInfo[UCS2_CANONICALIZATION_SETS] = {");
for (var tableIndex = 0; tableIndex < characterSetInfo.length; ++tableIndex)
    print(" ucs2CharacterSet" + tableIndex + ",");
print("};");
print();

// Print one range table: a size constant, then an array with one
// { begin, end, value, type } initializer per range, then a blank line.
function printRangeTable(structName, arrayName, countName, ranges)
{
    print("const size_t " + countName + " = " + ranges.length + ";");
    print(structName + " " + arrayName + "[" + countName + "] = {");
    for (var rangeIndex = 0; rangeIndex < ranges.length; ++rangeIndex) {
        var range = ranges[rangeIndex];
        // Entries are "TypeName:payload" strings; split them apart again.
        var parts = range.type.split(':');
        var typeName = parts[0];
        var payload = parts[1];
        print(" { " + hex(range.begin) + ", " + hex(range.end) + ", " + hex(payload) + ", " + typeName + " },");
    }
    print("};");
    print();
}

printRangeTable("UCS2CanonicalizationRange", "rangeInfo", "UCS2_CANONICALIZATION_RANGES", rangeInfo);
printRangeTable("LatinCanonicalizationRange", "latinRangeInfo", "LATIN_CANONICALIZATION_RANGES", latinRangeInfo);

// Close the namespaces.
print("} } // JSC::Yarr");
print();
219