// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html

#include <stdio.h>
#include <string>
#include <stdlib.h>
#include <errno.h>
#include <string.h>
#include <stdint.h>
#include <iostream>
#include <fstream>

// We only use U8_* macros, which are entirely inline.
#include "unicode/utf8.h"

// This contains a codepage and ISO 14882:1998 illegality table.
// Use "make gen-table" to rebuild it.
#include "cptbl.h"

/**
 * What is this?
 *
 * "This" is a preprocessor that makes an attempt to convert fully valid C++11 source code
 * in utf-8 into something consumable by certain compilers (Solaris, xlC)
 * which aren't quite standards compliant.
 *
 * - u"<unicode>" or u'<unicode>' gets converted to u"\uNNNN" or u'\uNNNN'
 * - u8"<unicode>" gets converted to "\xAA\xBB\xCC\xDD" etc.
 *   (some compilers do not support the u8 prefix correctly.)
 * - if the system is EBCDIC-based, that is used to correct the input characters.
 *
 * Usage:
 *   escapesrc infile.cpp outfile.cpp
 * Normally this is invoked by the build stage, with a rule such as:
 *
 * _%.cpp: $(srcdir)/%.cpp
 *       @$(BINDIR)/escapesrc$(EXEEXT) $< $@
 * %.o: _%.cpp
 *       $(COMPILE.cc) ... $@ $<
 *
 * In the Makefiles, SKIP_ESCAPING=YES is used to prevent escapesrc.cpp
 * from being itself escaped.
 */


// Whitespace characters recognized by skipws(), given as numeric code values.
static const char
  kSPACE   = 0x20,
  kTAB     = 0x09,
  kLF      = 0x0A,
  kCR      = 0x0D;

// For convenience: look a byte up in the cp1047_8859_1 table (presumably the
// codepage table provided by cptbl.h). The argument is converted to
// unsigned char first so that bytes >= 0x80 cannot become a negative index
// on platforms where plain char is signed.
# define cp1047_to_8859(c) cp1047_8859_1[static_cast<unsigned char>(c)]

// Our app's name
std::string prog;

0f5d89e8 A |
57 | /** |
58 | * Give the usual 1-line documentation and exit | |
59 | */ | |
f3c0d7a5 A |
60 | void usage() { |
61 | fprintf(stderr, "%s: usage: %s infile.cpp outfile.cpp\n", prog.c_str(), prog.c_str()); | |
62 | } | |
63 | ||
0f5d89e8 A |
64 | /** |
65 | * Delete the output file (if any) | |
66 | * We want to delete even if we didn't generate, because it might be stale. | |
67 | */ | |
f3c0d7a5 A |
68 | int cleanup(const std::string &outfile) { |
69 | const char *outstr = outfile.c_str(); | |
70 | if(outstr && *outstr) { | |
0f5d89e8 | 71 | int rc = std::remove(outstr); |
f3c0d7a5 A |
72 | if(rc == 0) { |
73 | fprintf(stderr, "%s: deleted %s\n", prog.c_str(), outstr); | |
74 | return 0; | |
75 | } else { | |
76 | if( errno == ENOENT ) { | |
77 | return 0; // File did not exist - no error. | |
78 | } else { | |
0f5d89e8 | 79 | perror("std::remove"); |
f3c0d7a5 A |
80 | return 1; |
81 | } | |
82 | } | |
83 | } | |
84 | return 0; | |
85 | } | |
86 | ||
0f5d89e8 A |
87 | /** |
88 | * Skip across any known whitespace. | |
89 | * @param p startpoint | |
90 | * @param e limit | |
91 | * @return first non-whitespace char | |
92 | */ | |
f3c0d7a5 A |
93 | inline const char *skipws(const char *p, const char *e) { |
94 | for(;p<e;p++) { | |
95 | switch(*p) { | |
96 | case kSPACE: | |
97 | case kTAB: | |
98 | case kLF: | |
99 | case kCR: | |
100 | break; | |
101 | default: | |
102 | return p; // non ws | |
103 | } | |
104 | } | |
105 | return p; | |
106 | } | |
107 | ||
/**
 * Append a byte, hex encoded as a four-character "\xNN" escape
 * (two uppercase hex digits).
 * @param outstr string to append to
 * @param byte the byte to append
 */
void appendByte(std::string &outstr,
                uint8_t byte) {
  char tmp2[5]; // backslash, 'x', two hex digits, terminating NUL
  // snprintf bounds the write to the buffer size.
  snprintf(tmp2, sizeof(tmp2), "\\x%02X", static_cast<unsigned int>(byte));
  outstr += tmp2;
}

120 | /** | |
0f5d89e8 A |
121 | * Append the bytes from 'linestr' into outstr, with escaping |
122 | * @param outstr the output buffer | |
123 | * @param linestr the input buffer | |
124 | * @param pos in/out: the current char under consideration | |
125 | * @param chars the number of chars to consider | |
f3c0d7a5 A |
126 | * @return true on failure |
127 | */ | |
128 | bool appendUtf8(std::string &outstr, | |
129 | const std::string &linestr, | |
130 | size_t &pos, | |
131 | size_t chars) { | |
132 | char tmp[9]; | |
133 | for(size_t i=0;i<chars;i++) { | |
134 | tmp[i] = linestr[++pos]; | |
135 | } | |
136 | tmp[chars] = 0; | |
137 | unsigned int c; | |
138 | sscanf(tmp, "%X", &c); | |
139 | UChar32 ch = c & 0x1FFFFF; | |
140 | ||
141 | // now to append \\x%% etc | |
142 | uint8_t bytesNeeded = U8_LENGTH(ch); | |
143 | if(bytesNeeded == 0) { | |
144 | fprintf(stderr, "Illegal code point U+%X\n", ch); | |
145 | return true; | |
146 | } | |
147 | uint8_t bytes[4]; | |
148 | uint8_t *s = bytes; | |
149 | size_t i = 0; | |
150 | U8_APPEND_UNSAFE(s, i, ch); | |
151 | for(size_t t = 0; t<i; t++) { | |
152 | appendByte(outstr, s[t]); | |
153 | } | |
154 | return false; | |
155 | } | |
156 | ||
157 | /** | |
0f5d89e8 | 158 | * Fixup u8"x" |
f3c0d7a5 A |
159 | * @param linestr string to mutate. Already escaped into \u format. |
160 | * @param origpos beginning, points to 'u8"' | |
161 | * @param pos end, points to " | |
162 | * @return false for no-problem, true for failure! | |
163 | */ | |
164 | bool fixu8(std::string &linestr, size_t origpos, size_t &endpos) { | |
165 | size_t pos = origpos + 3; | |
166 | std::string outstr; | |
167 | outstr += '\"'; // local encoding | |
168 | for(;pos<endpos;pos++) { | |
169 | char c = linestr[pos]; | |
170 | if(c == '\\') { | |
171 | char c2 = linestr[++pos]; | |
172 | switch(c2) { | |
173 | case '\'': | |
174 | case '"': | |
175 | #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY) | |
176 | c2 = cp1047_to_8859(c2); | |
177 | #endif | |
178 | appendByte(outstr, c2); | |
179 | break; | |
180 | case 'u': | |
181 | appendUtf8(outstr, linestr, pos, 4); | |
182 | break; | |
183 | case 'U': | |
184 | appendUtf8(outstr, linestr, pos, 8); | |
185 | break; | |
186 | } | |
187 | } else { | |
188 | #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY) | |
189 | c = cp1047_to_8859(c); | |
190 | #endif | |
191 | appendByte(outstr, c); | |
192 | } | |
193 | } | |
194 | outstr += ('\"'); | |
195 | ||
196 | linestr.replace(origpos, (endpos-origpos+1), outstr); | |
197 | ||
198 | return false; // OK | |
199 | } | |
200 | ||
201 | /** | |
0f5d89e8 A |
202 | * fix the u"x"/u'x'/u8"x" string at the position |
203 | * u8'x' is not supported, sorry. | |
204 | * @param linestr the input string | |
205 | * @param pos the position | |
206 | * @return false = no err, true = had err | |
f3c0d7a5 A |
207 | */ |
208 | bool fixAt(std::string &linestr, size_t pos) { | |
209 | size_t origpos = pos; | |
210 | ||
211 | if(linestr[pos] != 'u') { | |
212 | fprintf(stderr, "Not a 'u'?"); | |
213 | return true; | |
214 | } | |
215 | ||
216 | pos++; // past 'u' | |
217 | ||
218 | bool utf8 = false; | |
219 | ||
220 | if(linestr[pos] == '8') { // u8" | |
221 | utf8 = true; | |
222 | pos++; | |
223 | } | |
224 | ||
225 | char quote = linestr[pos]; | |
226 | ||
227 | if(quote != '\'' && quote != '\"') { | |
228 | fprintf(stderr, "Quote is '%c' - not sure what to do.\n", quote); | |
229 | return true; | |
230 | } | |
231 | ||
232 | if(quote == '\'' && utf8) { | |
233 | fprintf(stderr, "Cannot do u8'...'\n"); | |
234 | return true; | |
235 | } | |
236 | ||
237 | pos ++; | |
238 | ||
239 | //printf("u%c…%c\n", quote, quote); | |
240 | ||
241 | for(; pos < linestr.size(); pos++) { | |
242 | if(linestr[pos] == quote) { | |
243 | if(utf8) { | |
244 | return fixu8(linestr, origpos, pos); // fix u8"..." | |
245 | } else { | |
246 | return false; // end of quote | |
247 | } | |
248 | } | |
249 | if(linestr[pos] == '\\') { | |
250 | pos++; | |
251 | if(linestr[pos] == quote) continue; // quoted quote | |
252 | if(linestr[pos] == 'u') continue; // for now ... unicode escape | |
253 | if(linestr[pos] == '\\') continue; | |
254 | // some other escape… ignore | |
255 | } else { | |
256 | size_t old_pos = pos; | |
257 | int32_t i = pos; | |
258 | #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY) | |
259 | // mogrify 1-4 bytes from 1047 'back' to utf-8 | |
260 | char old_byte = linestr[pos]; | |
261 | linestr[pos] = cp1047_to_8859(linestr[pos]); | |
262 | // how many more? | |
263 | int32_t trail = U8_COUNT_TRAIL_BYTES(linestr[pos]); | |
264 | for(size_t pos2 = pos+1; trail>0; pos2++,trail--) { | |
265 | linestr[pos2] = cp1047_to_8859(linestr[pos2]); | |
266 | if(linestr[pos2] == 0x0A) { | |
267 | linestr[pos2] = 0x85; // NL is ambiguous here | |
268 | } | |
269 | } | |
270 | #endif | |
271 | ||
272 | // Proceed to decode utf-8 | |
273 | const uint8_t *s = (const uint8_t*) (linestr.c_str()); | |
274 | int32_t length = linestr.size(); | |
275 | UChar32 c; | |
276 | if(U8_IS_SINGLE((uint8_t)s[i]) && oldIllegal[s[i]]) { | |
277 | #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY) | |
278 | linestr[pos] = old_byte; // put it back | |
279 | #endif | |
280 | continue; // single code point not previously legal for \u escaping | |
281 | } | |
282 | ||
283 | // otherwise, convert it to \u / \U | |
284 | { | |
285 | U8_NEXT(s, i, length, c); | |
286 | } | |
287 | if(c<0) { | |
0f5d89e8 | 288 | fprintf(stderr, "Illegal utf-8 sequence at Column: %d\n", (int)old_pos); |
f3c0d7a5 A |
289 | fprintf(stderr, "Line: >>%s<<\n", linestr.c_str()); |
290 | return true; | |
291 | } | |
292 | ||
293 | size_t seqLen = (i-pos); | |
294 | ||
295 | //printf("U+%04X pos %d [len %d]\n", c, pos, seqLen);fflush(stdout); | |
296 | ||
297 | char newSeq[20]; | |
298 | if( c <= 0xFFFF) { | |
299 | sprintf(newSeq, "\\u%04X", c); | |
300 | } else { | |
301 | sprintf(newSeq, "\\U%08X", c); | |
302 | } | |
303 | linestr.replace(pos, seqLen, newSeq); | |
304 | pos += strlen(newSeq) - 1; | |
305 | } | |
306 | } | |
307 | ||
308 | return false; | |
309 | } | |
310 | ||
311 | /** | |
0f5d89e8 | 312 | * Fixup an entire line |
f3c0d7a5 A |
313 | * false = no err |
314 | * true = had err | |
0f5d89e8 A |
315 | * @param no the line number (not used) |
316 | * @param linestr the string to fix | |
317 | * @return true if any err, else false | |
f3c0d7a5 A |
318 | */ |
319 | bool fixLine(int /*no*/, std::string &linestr) { | |
320 | const char *line = linestr.c_str(); | |
321 | size_t len = linestr.size(); | |
322 | ||
323 | // no u' in the line? | |
324 | if(!strstr(line, "u'") && !strstr(line, "u\"") && !strstr(line, "u8\"")) { | |
325 | return false; // Nothing to do. No u' or u" detected | |
326 | } | |
327 | ||
f3c0d7a5 A |
328 | // start from the end and find all u" cases |
329 | size_t pos = len = linestr.size(); | |
3d1f044b A |
330 | if(len>INT32_MAX/2) { |
331 | return true; | |
332 | } | |
f3c0d7a5 A |
333 | while((pos>0) && (pos = linestr.rfind("u\"", pos)) != std::string::npos) { |
334 | //printf("found doublequote at %d\n", pos); | |
335 | if(fixAt(linestr, pos)) return true; | |
336 | if(pos == 0) break; | |
337 | pos--; | |
338 | } | |
339 | ||
340 | // reset and find all u' cases | |
341 | pos = len = linestr.size(); | |
342 | while((pos>0) && (pos = linestr.rfind("u'", pos)) != std::string::npos) { | |
343 | //printf("found singlequote at %d\n", pos); | |
344 | if(fixAt(linestr, pos)) return true; | |
345 | if(pos == 0) break; | |
346 | pos--; | |
347 | } | |
348 | ||
349 | // reset and find all u8" cases | |
350 | pos = len = linestr.size(); | |
351 | while((pos>0) && (pos = linestr.rfind("u8\"", pos)) != std::string::npos) { | |
352 | if(fixAt(linestr, pos)) return true; | |
353 | if(pos == 0) break; | |
354 | pos--; | |
355 | } | |
356 | ||
357 | //fprintf(stderr, "%d - fixed\n", no); | |
358 | return false; | |
359 | } | |
360 | ||
0f5d89e8 A |
361 | /** |
362 | * Convert a whole file | |
363 | * @param infile | |
364 | * @param outfile | |
365 | * @return 1 on err, 0 otherwise | |
366 | */ | |
f3c0d7a5 A |
367 | int convert(const std::string &infile, const std::string &outfile) { |
368 | fprintf(stderr, "escapesrc: %s -> %s\n", infile.c_str(), outfile.c_str()); | |
369 | ||
370 | std::ifstream inf; | |
371 | ||
372 | inf.open(infile.c_str(), std::ios::in); | |
373 | ||
374 | if(!inf.is_open()) { | |
375 | fprintf(stderr, "%s: could not open input file %s\n", prog.c_str(), infile.c_str()); | |
376 | cleanup(outfile); | |
377 | return 1; | |
378 | } | |
379 | ||
380 | std::ofstream outf; | |
381 | ||
382 | outf.open(outfile.c_str(), std::ios::out); | |
383 | ||
384 | if(!outf.is_open()) { | |
385 | fprintf(stderr, "%s: could not open output file %s\n", prog.c_str(), outfile.c_str()); | |
386 | return 1; | |
387 | } | |
388 | ||
389 | // TODO: any platform variations of #line? | |
390 | outf << "#line 1 \"" << infile << "\"" << '\n'; | |
391 | ||
392 | int no = 0; | |
393 | std::string linestr; | |
394 | while( getline( inf, linestr)) { | |
395 | no++; | |
396 | if(fixLine(no, linestr)) { | |
3d1f044b | 397 | goto fail; |
f3c0d7a5 A |
398 | } |
399 | outf << linestr << '\n'; | |
400 | } | |
401 | ||
3d1f044b A |
402 | if(inf.eof()) { |
403 | return 0; | |
404 | } | |
405 | fail: | |
406 | outf.close(); | |
407 | fprintf(stderr, "%s:%d: Fixup failed by %s\n", infile.c_str(), no, prog.c_str()); | |
408 | cleanup(outfile); | |
409 | return 1; | |
f3c0d7a5 A |
410 | } |
411 | ||
0f5d89e8 A |
412 | /** |
413 | * Main function | |
414 | */ | |
f3c0d7a5 A |
415 | int main(int argc, const char *argv[]) { |
416 | prog = argv[0]; | |
417 | ||
418 | if(argc != 3) { | |
419 | usage(); | |
420 | return 1; | |
421 | } | |
422 | ||
423 | std::string infile = argv[1]; | |
424 | std::string outfile = argv[2]; | |
425 | ||
426 | return convert(infile, outfile); | |
427 | } |