icuSources/tools/toolutil/uparse.c

   1 /*
   2 *******************************************************************************
   3 *
   4 *   Copyright (C) 2000-2010, International Business Machines
   5 *   Corporation and others.  All Rights Reserved.
   6 *
   7 *******************************************************************************
   8 *   file name:  uparse.c
   9 *   encoding:   US-ASCII
  10 *   tab size:   8 (not used)
  11 *   indentation:4
  12 *
  13 *   created on: 2000apr18
  14 *   created by: Markus W. Scherer
  15 *
  16 *   This file provides a parser for files that are delimited by one single
  17 *   character like ';' or TAB. Example: the Unicode Character Properties files
  18 *   like UnicodeData.txt are semicolon-delimited.
  19 */
  20
  21 #include "unicode/utypes.h"
  22 #include "cstring.h"
  23 #include "filestrm.h"
  24 #include "uparse.h"
  25 #include "unicode/uchar.h"
  26 #include "unicode/ustring.h"
  27 #include "ustr_imp.h"
  28
  29 #include <stdio.h>
  30
  31 U_CAPI const char * U_EXPORT2
  32 u_skipWhitespace(const char *s) {
  33     while(U_IS_INV_WHITESPACE(*s)) {
  34         ++s;
  35     }
  36     return s;
  37 }
  38
  39 U_CAPI char * U_EXPORT2
  40 u_rtrim(char *s) {
  41     char *end=uprv_strchr(s, 0);
  42     while(s<end && U_IS_INV_WHITESPACE(*(end-1))) {
  43         *--end = 0;
  44     }
  45     return end;
  46 }
  47
  48 /*
  49  * If the string starts with # @missing: then return the pointer to the
  50  * following non-whitespace character.
  51  * Otherwise return the original pointer.
  52  * Unicode 5.0 adds such lines in some data files to document
  53  * default property values.
  54  * Poor man's regex for variable amounts of white space.
  55  */
  56 static const char *
  57 getMissingLimit(const char *s) {
  58     const char *s0=s;
  59     if(
  60         *(s=u_skipWhitespace(s))=='#' &&
  61         *(s=u_skipWhitespace(s+1))=='@' &&
  62         0==strncmp((s=u_skipWhitespace(s+1)), "missing", 7) &&
  63         *(s=u_skipWhitespace(s+7))==':'
  64     ) {
  65         return u_skipWhitespace(s+1);
  66     } else {
  67         return s0;
  68     }
  69 }
  70
  71 U_CAPI void U_EXPORT2
  72 u_parseDelimitedFile(const char *filename, char delimiter,
  73                      char *fields[][2], int32_t fieldCount,
  74                      UParseLineFn *lineFn, void *context,
  75                      UErrorCode *pErrorCode) {
  76     FileStream *file;
  77     char line[300];
  78     char *start, *limit;
  79     int32_t i, length;
  80
  81     if(U_FAILURE(*pErrorCode)) {
  82         return;
  83     }
  84
  85     if(fields==NULL || lineFn==NULL || fieldCount<=0) {
  86         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
  87         return;
  88     }
  89
  90     if(filename==NULL || *filename==0 || (*filename=='-' && filename[1]==0)) {
  91         filename=NULL;
  92         file=T_FileStream_stdin();
  93     } else {
  94         file=T_FileStream_open(filename, "r");
  95     }
  96     if(file==NULL) {
  97         *pErrorCode=U_FILE_ACCESS_ERROR;
  98         return;
  99     }
 100
 101     while(T_FileStream_readLine(file, line, sizeof(line))!=NULL) {
 102         /* remove trailing newline characters */
 103         length=(int32_t)(u_rtrim(line)-line);
 104
 105         /*
 106          * detect a line with # @missing:
 107          * start parsing after that, or else from the beginning of the line
 108          * set the default warning for @missing lines
 109          */
 110         start=(char *)getMissingLimit(line);
 111         if(start==line) {
 112             *pErrorCode=U_ZERO_ERROR;
 113         } else {
 114             *pErrorCode=U_USING_DEFAULT_WARNING;
 115         }
 116
 117         /* skip this line if it is empty or a comment */
 118         if(*start==0 || *start=='#') {
 119             continue;
 120         }
 121
 122         /* remove in-line comments */
 123         limit=uprv_strchr(start, '#');
 124         if(limit!=NULL) {
 125             /* get white space before the pound sign */
 126             while(limit>start && U_IS_INV_WHITESPACE(*(limit-1))) {
 127                 --limit;
 128             }
 129
 130             /* truncate the line */
 131             *limit=0;
 132         }
 133
 134         /* skip lines with only whitespace */
 135         if(u_skipWhitespace(start)[0]==0) {
 136             continue;
 137         }
 138
 139         /* for each field, call the corresponding field function */
 140         for(i=0; i<fieldCount; ++i) {
 141             /* set the limit pointer of this field */
 142             limit=start;
 143             while(*limit!=delimiter && *limit!=0) {
 144                 ++limit;
 145             }
 146
 147             /* set the field start and limit in the fields array */
 148             fields[i][0]=start;
 149             fields[i][1]=limit;
 150
 151             /* set start to the beginning of the next field, if any */
 152             start=limit;
 153             if(*start!=0) {
 154                 ++start;
 155             } else if(i+1<fieldCount) {
 156                 *pErrorCode=U_PARSE_ERROR;
 157                 limit=line+length;
 158                 i=fieldCount;
 159                 break;
 160             }
 161         }
 162
 163         /* error in a field function? */
 164         if(U_FAILURE(*pErrorCode)) {
 165             break;
 166         }
 167
 168         /* call the field function */
 169         lineFn(context, fields, fieldCount, pErrorCode);
 170         if(U_FAILURE(*pErrorCode)) {
 171             break;
 172         }
 173     }
 174
 175     if(filename!=NULL) {
 176         T_FileStream_close(file);
 177     }
 178 }
 179
 180 /*
 181  * parse a list of code points
 182  * store them as a UTF-32 string in dest[destCapacity]
 183  * return the number of code points
 184  */
 185 U_CAPI int32_t U_EXPORT2
 186 u_parseCodePoints(const char *s,
 187                   uint32_t *dest, int32_t destCapacity,
 188                   UErrorCode *pErrorCode) {
 189     char *end;
 190     uint32_t value;
 191     int32_t count;
 192
 193     if(U_FAILURE(*pErrorCode)) {
 194         return 0;
 195     }
 196     if(s==NULL || destCapacity<0 || (destCapacity>0 && dest==NULL)) {
 197         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
 198         return 0;
 199     }
 200
 201     count=0;
 202     for(;;) {
 203         s=u_skipWhitespace(s);
 204         if(*s==';' || *s==0) {
 205             return count;
 206         }
 207
 208         /* read one code point */
 209         value=(uint32_t)uprv_strtoul(s, &end, 16);
 210         if(end<=s || (!U_IS_INV_WHITESPACE(*end) && *end!=';' && *end!=0) || value>=0x110000) {
 211             *pErrorCode=U_PARSE_ERROR;
 212             return 0;
 213         }
 214
 215         /* append it to the destination array */
 216         if(count<destCapacity) {
 217             dest[count++]=value;
 218         } else {
 219             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 220         }
 221
 222         /* go to the following characters */
 223         s=end;
 224     }
 225 }
 226
 227 /*
 228  * parse a list of code points
 229  * store them as a string in dest[destCapacity]
 230  * set the first code point in *pFirst
 231  * @return The length of the string in numbers of UChars.
 232  */
 233 U_CAPI int32_t U_EXPORT2
 234 u_parseString(const char *s,
 235               UChar *dest, int32_t destCapacity,
 236               uint32_t *pFirst,
 237               UErrorCode *pErrorCode) {
 238     char *end;
 239     uint32_t value;
 240     int32_t destLength;
 241
 242     if(U_FAILURE(*pErrorCode)) {
 243         return 0;
 244     }
 245     if(s==NULL || destCapacity<0 || (destCapacity>0 && dest==NULL)) {
 246         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
 247     }
 248
 249     if(pFirst!=NULL) {
 250         *pFirst=0xffffffff;
 251     }
 252
 253     destLength=0;
 254     for(;;) {
 255         s=u_skipWhitespace(s);
 256         if(*s==';' || *s==0) {
 257             if(destLength<destCapacity) {
 258                 dest[destLength]=0;
 259             } else if(destLength==destCapacity) {
 260                 *pErrorCode=U_STRING_NOT_TERMINATED_WARNING;
 261             } else {
 262                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 263             }
 264             return destLength;
 265         }
 266
 267         /* read one code point */
 268         value=(uint32_t)uprv_strtoul(s, &end, 16);
 269         if(end<=s || (!U_IS_INV_WHITESPACE(*end) && *end!=';' && *end!=0) || value>=0x110000) {
 270             *pErrorCode=U_PARSE_ERROR;
 271             return 0;
 272         }
 273
 274         /* store the first code point */
 275         if(pFirst!=NULL) {
 276             *pFirst=value;
 277             pFirst=NULL;
 278         }
 279
 280         /* append it to the destination array */
 281         if((destLength+U16_LENGTH(value))<=destCapacity) {
 282             U16_APPEND_UNSAFE(dest, destLength, value);
 283         } else {
 284             destLength+=U16_LENGTH(value);
 285         }
 286
 287         /* go to the following characters */
 288         s=end;
 289     }
 290 }
 291
 292 /* read a range like start or start..end */
 293 U_CAPI int32_t U_EXPORT2
 294 u_parseCodePointRangeAnyTerminator(const char *s,
 295                                    uint32_t *pStart, uint32_t *pEnd,
 296                                    const char **terminator,
 297                                    UErrorCode *pErrorCode) {
 298     char *end;
 299     uint32_t value;
 300
 301     if(U_FAILURE(*pErrorCode)) {
 302         return 0;
 303     }
 304     if(s==NULL || pStart==NULL || pEnd==NULL) {
 305         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
 306         return 0;
 307     }
 308
 309     /* read the start code point */
 310     s=u_skipWhitespace(s);
 311     value=(uint32_t)uprv_strtoul(s, &end, 16);
 312     if(end<=s || value>=0x110000) {
 313         *pErrorCode=U_PARSE_ERROR;
 314         return 0;
 315     }
 316     *pStart=*pEnd=value;
 317
 318     /* is there a "..end"? */
 319     s=u_skipWhitespace(end);
 320     if(*s!='.' || s[1]!='.') {
 321         *terminator=end;
 322         return 1;
 323     }
 324     s=u_skipWhitespace(s+2);
 325
 326     /* read the end code point */
 327     value=(uint32_t)uprv_strtoul(s, &end, 16);
 328     if(end<=s || value>=0x110000) {
 329         *pErrorCode=U_PARSE_ERROR;
 330         return 0;
 331     }
 332     *pEnd=value;
 333
 334     /* is this a valid range? */
 335     if(value<*pStart) {
 336         *pErrorCode=U_PARSE_ERROR;
 337         return 0;
 338     }
 339
 340     *terminator=end;
 341     return value-*pStart+1;
 342 }
 343
 344 U_CAPI int32_t U_EXPORT2
 345 u_parseCodePointRange(const char *s,
 346                       uint32_t *pStart, uint32_t *pEnd,
 347                       UErrorCode *pErrorCode) {
 348     const char *terminator;
 349     int32_t rangeLength=
 350         u_parseCodePointRangeAnyTerminator(s, pStart, pEnd, &terminator, pErrorCode);
 351     if(U_SUCCESS(*pErrorCode)) {
 352         terminator=u_skipWhitespace(terminator);
 353         if(*terminator!=';' && *terminator!=0) {
 354             *pErrorCode=U_PARSE_ERROR;
 355             return 0;
 356         }
 357     }
 358     return rangeLength;
 359 }
 360
 361 U_CAPI int32_t U_EXPORT2
 362 u_parseUTF8(const char *source, int32_t sLen, char *dest, int32_t destCapacity, UErrorCode *status) {
 363     const char *read = source;
 364     int32_t i = 0;
 365     unsigned int value = 0;
 366     if(sLen == -1) {
 367         sLen = (int32_t)strlen(source);
 368     }
 369
 370     while(read < source+sLen) {
 371         sscanf(read, "%2x", &value);
 372         if(i < destCapacity) {
 373             dest[i] = (char)value;
 374         }
 375         i++;
 376         read += 2;
 377     }
 378     return u_terminateChars(dest, destCapacity, i, status);
 379 }