icuSources/tools/escapesrc/escapesrc.cpp

   1 // © 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3
   4 #include <stdio.h>
   5 #include <string>
   6 #include <stdlib.h>
   7 #include <errno.h>
   8 #include <string.h>
   9 #include <iostream>
  10 #include <fstream>
  11
  12 // We only use U8_* macros, which are entirely inline.
  13 #include "unicode/utf8.h"
  14
  15 // This contains a codepage and ISO 14882:1998 illegality table.
  16 // Use "make gen-table" to rebuild it.
  17 #include "cptbl.h"
  18
  19 /**
  20  * What is this?
  21  *
  22  * "This" is a preprocessor that makes an attempt to convert fully valid C++11 source code
  23  * in utf-8 into something consumable by certain compilers (Solaris, xlC)
  24  * which aren't quite standards compliant.
  25  *
  26  * - u"<unicode>" or u'<unicode>' gets converted to u"\uNNNN" or u'\uNNNN'
  27  * - u8"<unicode>" gets converted to "\xAA\xBB\xCC\xDD" etc.
  28  *   (some compilers do not support the u8 prefix correctly.)
  29  * - if the system is EBCDIC-based, that is used to correct the input characters.
  30  *
  31  * Usage:
  32  *   escapesrc infile.cpp outfile.cpp
  33  * Normally this is invoked by the build stage, with a rule such as:
  34  *
  35  * _%.cpp: $(srcdir)/%.cpp
  36  *       @$(BINDIR)/escapesrc$(EXEEXT) $< $@
  37  * %.o: _%.cpp
  38  *       $(COMPILE.cc) ... $@ $<
  39  *
  40  * In the Makefiles, SKIP_ESCAPING=YES is used to prevent escapesrc.cpp
  41  * from being itself escaped.
  42  */
  43
  44
  45 static const char
  46   kSPACE   = 0x20,
  47   kTAB     = 0x09,
  48   kLF      = 0x0A,
  49   kCR      = 0x0D;
  50
  51 // For convenience
  52 # define cp1047_to_8859(c) cp1047_8859_1[c]
  53
  54 // Our app's name
  55 std::string prog;
  56
  57 /**
  58  * Give the usual 1-line documentation and exit
  59  */
  60 void usage() {
  61   fprintf(stderr, "%s: usage: %s infile.cpp outfile.cpp\n", prog.c_str(), prog.c_str());
  62 }
  63
  64 /**
  65  * Delete the output file (if any)
  66  * We want to delete even if we didn't generate, because it might be stale.
  67  */
  68 int cleanup(const std::string &outfile) {
  69   const char *outstr = outfile.c_str();
  70   if(outstr && *outstr) {
  71     int rc = std::remove(outstr);
  72     if(rc == 0) {
  73       fprintf(stderr, "%s: deleted %s\n", prog.c_str(), outstr);
  74       return 0;
  75     } else {
  76       if( errno == ENOENT ) {
  77         return 0; // File did not exist - no error.
  78       } else {
  79         perror("std::remove");
  80         return 1;
  81       }
  82     }
  83   }
  84   return 0;
  85 }
  86
  87 /**
  88  * Skip across any known whitespace.
  89  * @param p startpoint
  90  * @param e limit
  91  * @return first non-whitespace char
  92  */
  93 inline const char *skipws(const char *p, const char *e) {
  94   for(;p<e;p++) {
  95     switch(*p) {
  96     case kSPACE:
  97     case kTAB:
  98     case kLF:
  99     case kCR:
 100       break;
 101     default:
 102       return p; // non ws
 103     }
 104   }
 105   return p;
 106 }
 107
 108 /**
 109  * Append a byte, hex encoded
 110  * @param outstr sstring to append to
 111  * @param byte the byte to append
 112  */
 113 void appendByte(std::string &outstr,
 114                 uint8_t byte) {
 115     char tmp2[5];
 116     sprintf(tmp2, "\\x%02X", 0xFF & (int)(byte));
 117     outstr += tmp2;
 118 }
 119
 120 /**
 121  * Append the bytes from 'linestr' into outstr, with escaping
 122  * @param outstr the output buffer
 123  * @param linestr the input buffer
 124  * @param pos in/out: the current char under consideration
 125  * @param chars the number of chars to consider
 126  * @return true on failure
 127  */
 128 bool appendUtf8(std::string &outstr,
 129                 const std::string &linestr,
 130                 size_t &pos,
 131                 size_t chars) {
 132   char tmp[9];
 133   for(size_t i=0;i<chars;i++) {
 134     tmp[i] = linestr[++pos];
 135   }
 136   tmp[chars] = 0;
 137   unsigned int c;
 138   sscanf(tmp, "%X", &c);
 139   UChar32 ch = c & 0x1FFFFF;
 140
 141   // now to append \\x%% etc
 142   uint8_t bytesNeeded = U8_LENGTH(ch);
 143   if(bytesNeeded == 0) {
 144     fprintf(stderr, "Illegal code point U+%X\n", ch);
 145     return true;
 146   }
 147   uint8_t bytes[4];
 148   uint8_t *s = bytes;
 149   size_t i = 0;
 150   U8_APPEND_UNSAFE(s, i, ch);
 151   for(size_t t = 0; t<i; t++) {
 152     appendByte(outstr, s[t]);
 153   }
 154   return false;
 155 }
 156
 157 /**
 158  * Fixup u8"x"
 159  * @param linestr string to mutate. Already escaped into \u format.
 160  * @param origpos beginning, points to 'u8"'
 161  * @param pos end, points to "
 162  * @return false for no-problem, true for failure!
 163  */
 164 bool fixu8(std::string &linestr, size_t origpos, size_t &endpos) {
 165   size_t pos = origpos + 3;
 166   std::string outstr;
 167   outstr += '\"'; // local encoding
 168   for(;pos<endpos;pos++) {
 169     char c = linestr[pos];
 170     if(c == '\\') {
 171       char c2 = linestr[++pos];
 172       switch(c2) {
 173       case '\'':
 174       case '"':
 175 #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY)
 176         c2 = cp1047_to_8859(c2);
 177 #endif
 178         appendByte(outstr, c2);
 179         break;
 180       case 'u':
 181         appendUtf8(outstr, linestr, pos, 4);
 182         break;
 183       case 'U':
 184         appendUtf8(outstr, linestr, pos, 8);
 185         break;
 186       }
 187     } else {
 188 #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY)
 189       c = cp1047_to_8859(c);
 190 #endif
 191       appendByte(outstr, c);
 192     }
 193   }
 194   outstr += ('\"');
 195
 196   linestr.replace(origpos, (endpos-origpos+1), outstr);
 197
 198   return false; // OK
 199 }
 200
 201 /**
 202  * fix the u"x"/u'x'/u8"x" string at the position
 203  * u8'x' is not supported, sorry.
 204  * @param linestr the input string
 205  * @param pos the position
 206  * @return false = no err, true = had err
 207  */
 208 bool fixAt(std::string &linestr, size_t pos) {
 209   size_t origpos = pos;
 210
 211   if(linestr[pos] != 'u') {
 212     fprintf(stderr, "Not a 'u'?");
 213     return true;
 214   }
 215
 216   pos++; // past 'u'
 217
 218   bool utf8 = false;
 219
 220   if(linestr[pos] == '8') { // u8"
 221     utf8 = true;
 222     pos++;
 223   }
 224
 225   char quote = linestr[pos];
 226
 227   if(quote != '\'' && quote != '\"') {
 228     fprintf(stderr, "Quote is '%c' - not sure what to do.\n", quote);
 229     return true;
 230   }
 231
 232   if(quote == '\'' && utf8) {
 233     fprintf(stderr, "Cannot do u8'...'\n");
 234     return true;
 235   }
 236
 237   pos ++;
 238
 239   //printf("u%c…%c\n", quote, quote);
 240
 241   for(; pos < linestr.size(); pos++) {
 242     if(linestr[pos] == quote) {
 243       if(utf8) {
 244         return fixu8(linestr, origpos, pos); // fix u8"..."
 245       } else {
 246         return false; // end of quote
 247       }
 248     }
 249     if(linestr[pos] == '\\') {
 250       pos++;
 251       if(linestr[pos] == quote) continue; // quoted quote
 252       if(linestr[pos] == 'u') continue; // for now ... unicode escape
 253       if(linestr[pos] == '\\') continue;
 254       // some other escape… ignore
 255     } else {
 256       size_t old_pos = pos;
 257       int32_t i = pos;
 258 #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY)
 259       // mogrify 1-4 bytes from 1047 'back' to utf-8
 260       char old_byte = linestr[pos];
 261       linestr[pos] = cp1047_to_8859(linestr[pos]);
 262       // how many more?
 263       int32_t trail = U8_COUNT_TRAIL_BYTES(linestr[pos]);
 264       for(size_t pos2 = pos+1; trail>0; pos2++,trail--) {
 265         linestr[pos2] = cp1047_to_8859(linestr[pos2]);
 266         if(linestr[pos2] == 0x0A) {
 267           linestr[pos2] = 0x85; // NL is ambiguous here
 268         }
 269       }
 270 #endif
 271
 272       // Proceed to decode utf-8
 273       const uint8_t *s = (const uint8_t*) (linestr.c_str());
 274       int32_t length = linestr.size();
 275       UChar32 c;
 276       if(U8_IS_SINGLE((uint8_t)s[i]) && oldIllegal[s[i]]) {
 277 #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY)
 278         linestr[pos] = old_byte; // put it back
 279 #endif
 280         continue; // single code point not previously legal for \u escaping
 281       }
 282
 283       // otherwise, convert it to \u / \U
 284       {
 285         U8_NEXT(s, i, length, c);
 286       }
 287       if(c<0) {
 288         fprintf(stderr, "Illegal utf-8 sequence at Column: %d\n", (int)old_pos);
 289         fprintf(stderr, "Line: >>%s<<\n", linestr.c_str());
 290         return true;
 291       }
 292
 293       size_t seqLen = (i-pos);
 294
 295       //printf("U+%04X pos %d [len %d]\n", c, pos, seqLen);fflush(stdout);
 296
 297       char newSeq[20];
 298       if( c <= 0xFFFF) {
 299         sprintf(newSeq, "\\u%04X", c);
 300       } else {
 301         sprintf(newSeq, "\\U%08X", c);
 302       }
 303       linestr.replace(pos, seqLen, newSeq);
 304       pos += strlen(newSeq) - 1;
 305     }
 306   }
 307
 308   return false;
 309 }
 310
 311 /**
 312  * Fixup an entire line
 313  * false = no err
 314  * true = had err
 315  * @param no the line number (not used)
 316  * @param linestr the string to fix
 317  * @return true if any err, else false
 318  */
 319 bool fixLine(int /*no*/, std::string &linestr) {
 320   const char *line = linestr.c_str();
 321   size_t len = linestr.size();
 322
 323   // no u' in the line?
 324   if(!strstr(line, "u'") && !strstr(line, "u\"") && !strstr(line, "u8\"")) {
 325     return false; // Nothing to do. No u' or u" detected
 326   }
 327
 328   // start from the end and find all u" cases
 329   size_t pos = len = linestr.size();
 330   while((pos>0) && (pos = linestr.rfind("u\"", pos)) != std::string::npos) {
 331     //printf("found doublequote at %d\n", pos);
 332     if(fixAt(linestr, pos)) return true;
 333     if(pos == 0) break;
 334     pos--;
 335   }
 336
 337   // reset and find all u' cases
 338   pos = len = linestr.size();
 339   while((pos>0) && (pos = linestr.rfind("u'", pos)) != std::string::npos) {
 340     //printf("found singlequote at %d\n", pos);
 341     if(fixAt(linestr, pos)) return true;
 342     if(pos == 0) break;
 343     pos--;
 344   }
 345
 346   // reset and find all u8" cases
 347   pos = len = linestr.size();
 348   while((pos>0) && (pos = linestr.rfind("u8\"", pos)) != std::string::npos) {
 349     if(fixAt(linestr, pos)) return true;
 350     if(pos == 0) break;
 351     pos--;
 352   }
 353
 354   //fprintf(stderr, "%d - fixed\n", no);
 355   return false;
 356 }
 357
 358 /**
 359  * Convert a whole file
 360  * @param infile
 361  * @param outfile
 362  * @return 1 on err, 0 otherwise
 363  */
 364 int convert(const std::string &infile, const std::string &outfile) {
 365   fprintf(stderr, "escapesrc: %s -> %s\n", infile.c_str(), outfile.c_str());
 366
 367   std::ifstream inf;
 368
 369   inf.open(infile.c_str(), std::ios::in);
 370
 371   if(!inf.is_open()) {
 372     fprintf(stderr, "%s: could not open input file %s\n", prog.c_str(), infile.c_str());
 373     cleanup(outfile);
 374     return 1;
 375   }
 376
 377   std::ofstream outf;
 378
 379   outf.open(outfile.c_str(), std::ios::out);
 380
 381   if(!outf.is_open()) {
 382     fprintf(stderr, "%s: could not open output file %s\n", prog.c_str(), outfile.c_str());
 383     return 1;
 384   }
 385
 386   // TODO: any platform variations of #line?
 387   outf << "#line 1 \"" << infile << "\"" << '\n';
 388
 389   int no = 0;
 390   std::string linestr;
 391   while( getline( inf, linestr)) {
 392     no++;
 393     if(fixLine(no, linestr)) {
 394       outf.close();
 395       fprintf(stderr, "%s:%d: Fixup failed by %s\n", infile.c_str(), no, prog.c_str());
 396       cleanup(outfile);
 397       return 1;
 398     }
 399     outf << linestr << '\n';
 400   }
 401
 402   return 0;
 403 }
 404
 405 /**
 406  * Main function
 407  */
 408 int main(int argc, const char *argv[]) {
 409   prog = argv[0];
 410
 411   if(argc != 3) {
 412     usage();
 413     return 1;
 414   }
 415
 416   std::string infile = argv[1];
 417   std::string outfile = argv[2];
 418
 419   return convert(infile, outfile);
 420 }