]>
git.saurik.com Git - apple/icu.git/blob - icuSources/tools/escapesrc/escapesrc.cpp
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
12 // We only use U8_* macros, which are entirely inline.
13 #include "unicode/utf8.h"
15 // This contains a codepage and ISO 14882:1998 illegality table.
16 // Use "make gen-table" to rebuild it.
22 * "This" is a preprocessor that makes an attempt to convert fully valid C++11 source code
23 * in utf-8 into something consumable by certain compilers (Solaris, xlC)
24 * which aren't quite standards compliant.
26 * - u"<unicode>" or u'<unicode>' gets converted to u"\uNNNN" or u'\uNNNN'
27 * - u8"<unicode>" gets converted to "\xAA\xBB\xCC\xDD" etc.
28 * (some compilers do not support the u8 prefix correctly.)
29 * - if the system is EBCDIC-based, that is used to correct the input characters.
32 * escapesrc infile.cpp outfile.cpp
33 * Normally this is invoked by the build stage, with a rule such as:
35 * _%.cpp: $(srcdir)/%.cpp
36 * @$(BINDIR)/escapesrc$(EXEEXT) $< $@
38 * $(COMPILE.cc) ... $@ $<
40 * In the Makefiles, SKIP_ESCAPING=YES is used to prevent escapesrc.cpp
41 * from being itself escaped.
52 # define cp1047_to_8859(c) cp1047_8859_1[c]
58 * Give the usual 1-line documentation and exit
61 fprintf(stderr
, "%s: usage: %s infile.cpp outfile.cpp\n", prog
.c_str(), prog
.c_str());
65 * Delete the output file (if any)
66 * We want to delete even if we didn't generate, because it might be stale.
68 int cleanup(const std::string
&outfile
) {
69 const char *outstr
= outfile
.c_str();
70 if(outstr
&& *outstr
) {
71 int rc
= std::remove(outstr
);
73 fprintf(stderr
, "%s: deleted %s\n", prog
.c_str(), outstr
);
76 if( errno
== ENOENT
) {
77 return 0; // File did not exist - no error.
79 perror("std::remove");
88 * Skip across any known whitespace.
91 * @return first non-whitespace char
93 inline const char *skipws(const char *p
, const char *e
) {
109 * Append a byte, hex encoded
110 * @param outstr string to append to
111 * @param byte the byte to append
113 void appendByte(std::string
&outstr
,
116 sprintf(tmp2
, "\\x%02X", 0xFF & (int)(byte
));
121 * Append the bytes from 'linestr' into outstr, with escaping
122 * @param outstr the output buffer
123 * @param linestr the input buffer
124 * @param pos in/out: the current char under consideration
125 * @param chars the number of chars to consider
126 * @return true on failure
128 bool appendUtf8(std::string
&outstr
,
129 const std::string
&linestr
,
133 for(size_t i
=0;i
<chars
;i
++) {
134 tmp
[i
] = linestr
[++pos
];
138 sscanf(tmp
, "%X", &c
);
139 UChar32 ch
= c
& 0x1FFFFF;
141 // now to append \\x%% etc
142 uint8_t bytesNeeded
= U8_LENGTH(ch
);
143 if(bytesNeeded
== 0) {
144 fprintf(stderr
, "Illegal code point U+%X\n", ch
);
150 U8_APPEND_UNSAFE(s
, i
, ch
);
151 for(size_t t
= 0; t
<i
; t
++) {
152 appendByte(outstr
, s
[t
]);
159 * @param linestr string to mutate. Already escaped into \u format.
160 * @param origpos beginning, points to 'u8"'
161 * @param endpos end, points to the closing "
162 * @return false for no-problem, true for failure!
164 bool fixu8(std::string
&linestr
, size_t origpos
, size_t &endpos
) {
165 size_t pos
= origpos
+ 3;
167 outstr
+= '\"'; // local encoding
168 for(;pos
<endpos
;pos
++) {
169 char c
= linestr
[pos
];
171 char c2
= linestr
[++pos
];
175 #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY)
176 c2
= cp1047_to_8859(c2
);
178 appendByte(outstr
, c2
);
181 appendUtf8(outstr
, linestr
, pos
, 4);
184 appendUtf8(outstr
, linestr
, pos
, 8);
188 #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY)
189 c
= cp1047_to_8859(c
);
191 appendByte(outstr
, c
);
196 linestr
.replace(origpos
, (endpos
-origpos
+1), outstr
);
202 * fix the u"x"/u'x'/u8"x" string at the position
203 * u8'x' is not supported, sorry.
204 * @param linestr the input string
205 * @param pos the position
206 * @return false = no err, true = had err
208 bool fixAt(std::string
&linestr
, size_t pos
) {
209 size_t origpos
= pos
;
211 if(linestr
[pos
] != 'u') {
212 fprintf(stderr
, "Not a 'u'?");
220 if(linestr
[pos
] == '8') { // u8"
225 char quote
= linestr
[pos
];
227 if(quote
!= '\'' && quote
!= '\"') {
228 fprintf(stderr
, "Quote is '%c' - not sure what to do.\n", quote
);
232 if(quote
== '\'' && utf8
) {
233 fprintf(stderr
, "Cannot do u8'...'\n");
239 //printf("u%c…%c\n", quote, quote);
241 for(; pos
< linestr
.size(); pos
++) {
242 if(linestr
[pos
] == quote
) {
244 return fixu8(linestr
, origpos
, pos
); // fix u8"..."
246 return false; // end of quote
249 if(linestr
[pos
] == '\\') {
251 if(linestr
[pos
] == quote
) continue; // quoted quote
252 if(linestr
[pos
] == 'u') continue; // for now ... unicode escape
253 if(linestr
[pos
] == '\\') continue;
254 // some other escape… ignore
256 size_t old_pos
= pos
;
258 #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY)
259 // mogrify 1-4 bytes from 1047 'back' to utf-8
260 char old_byte
= linestr
[pos
];
261 linestr
[pos
] = cp1047_to_8859(linestr
[pos
]);
263 int32_t trail
= U8_COUNT_TRAIL_BYTES(linestr
[pos
]);
264 for(size_t pos2
= pos
+1; trail
>0; pos2
++,trail
--) {
265 linestr
[pos2
] = cp1047_to_8859(linestr
[pos2
]);
266 if(linestr
[pos2
] == 0x0A) {
267 linestr
[pos2
] = 0x85; // NL is ambiguous here
272 // Proceed to decode utf-8
273 const uint8_t *s
= (const uint8_t*) (linestr
.c_str());
274 int32_t length
= linestr
.size();
276 if(U8_IS_SINGLE((uint8_t)s
[i
]) && oldIllegal
[s
[i
]]) {
277 #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY)
278 linestr
[pos
] = old_byte
; // put it back
280 continue; // single code point not previously legal for \u escaping
283 // otherwise, convert it to \u / \U
285 U8_NEXT(s
, i
, length
, c
);
288 fprintf(stderr
, "Illegal utf-8 sequence at Column: %d\n", (int)old_pos
);
289 fprintf(stderr
, "Line: >>%s<<\n", linestr
.c_str());
293 size_t seqLen
= (i
-pos
);
295 //printf("U+%04X pos %d [len %d]\n", c, pos, seqLen);fflush(stdout);
299 sprintf(newSeq
, "\\u%04X", c
);
301 sprintf(newSeq
, "\\U%08X", c
);
303 linestr
.replace(pos
, seqLen
, newSeq
);
304 pos
+= strlen(newSeq
) - 1;
312 * Fixup an entire line
315 * @param no the line number (not used)
316 * @param linestr the string to fix
317 * @return true if any err, else false
319 bool fixLine(int /*no*/, std::string
&linestr
) {
320 const char *line
= linestr
.c_str();
321 size_t len
= linestr
.size();
323 // no u', u" or u8" in the line?
324 if(!strstr(line
, "u'") && !strstr(line
, "u\"") && !strstr(line
, "u8\"")) {
325 return false; // Nothing to do. No u' or u" detected
328 // start from the end and find all u" cases
329 size_t pos
= len
= linestr
.size();
330 while((pos
>0) && (pos
= linestr
.rfind("u\"", pos
)) != std::string::npos
) {
331 //printf("found doublequote at %d\n", pos);
332 if(fixAt(linestr
, pos
)) return true;
337 // reset and find all u' cases
338 pos
= len
= linestr
.size();
339 while((pos
>0) && (pos
= linestr
.rfind("u'", pos
)) != std::string::npos
) {
340 //printf("found singlequote at %d\n", pos);
341 if(fixAt(linestr
, pos
)) return true;
346 // reset and find all u8" cases
347 pos
= len
= linestr
.size();
348 while((pos
>0) && (pos
= linestr
.rfind("u8\"", pos
)) != std::string::npos
) {
349 if(fixAt(linestr
, pos
)) return true;
354 //fprintf(stderr, "%d - fixed\n", no);
359 * Convert a whole file
362 * @return 1 on err, 0 otherwise
364 int convert(const std::string
&infile
, const std::string
&outfile
) {
365 fprintf(stderr
, "escapesrc: %s -> %s\n", infile
.c_str(), outfile
.c_str());
369 inf
.open(infile
.c_str(), std::ios::in
);
372 fprintf(stderr
, "%s: could not open input file %s\n", prog
.c_str(), infile
.c_str());
379 outf
.open(outfile
.c_str(), std::ios::out
);
381 if(!outf
.is_open()) {
382 fprintf(stderr
, "%s: could not open output file %s\n", prog
.c_str(), outfile
.c_str());
386 // TODO: any platform variations of #line?
387 outf
<< "#line 1 \"" << infile
<< "\"" << '\n';
391 while( getline( inf
, linestr
)) {
393 if(fixLine(no
, linestr
)) {
395 fprintf(stderr
, "%s:%d: Fixup failed by %s\n", infile
.c_str(), no
, prog
.c_str());
399 outf
<< linestr
<< '\n';
408 int main(int argc
, const char *argv
[]) {
416 std::string infile
= argv
[1];
417 std::string outfile
= argv
[2];
419 return convert(infile
, outfile
);