git.saurik.com Git - apple/icu.git/blame - icuSources/tools/genrb/read.c

Commit	Line	Data
f3c0d7a5 A	1	// © 2016 and later: Unicode, Inc. and others.
f3c0d7a5 A	2	// License & terms of use: http://www.unicode.org/copyright.html
b75a7d8f A	3	/*
	4	*******************************************************************************
	5	*
51004dcb	6	* Copyright (C) 1998-2012, International Business Machines
b75a7d8f A	7	* Corporation and others. All Rights Reserved.
	8	*
	9	*******************************************************************************
	10	*
	11	* File read.c
	12	*
	13	* Modification History:
	14	*
	15	* Date Name Description
	16	* 05/26/99 stephen Creation.
	17	* 5/10/01 Ram removed ustdio dependency
	18	*******************************************************************************
	19	*/
	20
	21	#include "read.h"
	22	#include "errmsg.h"
	23	#include "unicode/ustring.h"
51004dcb	24	#include "unicode/utf16.h"
b75a7d8f A	25
	26	#define OPENBRACE 0x007B
	27	#define CLOSEBRACE 0x007D
	28	#define COMMA 0x002C
	29	#define QUOTE 0x0022
	30	#define ESCAPE 0x005C
	31	#define SLASH 0x002F
	32	#define ASTERISK 0x002A
	33	#define SPACE 0x0020
	34	#define COLON 0x003A
	35	#define BADBOM 0xFFFE
374ca955 A	36	#define CR 0x000D
	37	#define LF 0x000A
	38
b75a7d8f A	39	static int32_t lineCount;
	40
	41	/* Protos */
	42	static enum ETokenType getStringToken(UCHARBUF *buf,
	43	UChar32 initialChar,
	44	struct UString *token,
	45	UErrorCode *status);
	46
374ca955 A	47	static UChar32 getNextChar (UCHARBUF buf, UBool skipwhite, struct UString token, UErrorCode *status);
	48	static void seekUntilNewline (UCHARBUF buf, struct UString token, UErrorCode *status);
	49	static void seekUntilEndOfComment (UCHARBUF buf, struct UString token, UErrorCode *status);
b75a7d8f A	50	static UBool isWhitespace (UChar32 c);
	51	static UBool isNewline (UChar32 c);
	52
4388f060	53	U_CFUNC void resetLineNumber() {
b75a7d8f A	54	lineCount = 1;
	55	}
	56
	57	/* Read and return the next token from the stream. If the token is of
	58	type eString, fill in the token parameter with the token. If the
	59	token is eError, then the status parameter will contain the
	60	specific error. This will be eItemNotFound at the end of file,
	61	indicating that all tokens have been returned. This method will
	62	never return eString twice in a row; instead, multiple adjacent
	63	string tokens will be merged into one, with no intervening
	64	space. */
4388f060 A	65	U_CFUNC enum ETokenType
	66	getNextToken(UCHARBUF* buf,
	67	struct UString *token,
	68	uint32_t linenumber, / out: linenumber of token */
	69	struct UString *comment,
	70	UErrorCode *status) {
b75a7d8f A	71	enum ETokenType result;
	72	UChar32 c;
	73
	74	if (U_FAILURE(*status)) {
	75	return TOK_ERROR;
	76	}
	77
	78	/* Skip whitespace */
374ca955	79	c = getNextChar(buf, TRUE, comment, status);
b75a7d8f A	80
	81	if (U_FAILURE(*status)) {
	82	return TOK_ERROR;
	83	}
	84
	85	*linenumber = lineCount;
	86
	87	switch(c) {
	88	case BADBOM:
	89	return TOK_ERROR;
	90	case OPENBRACE:
	91	return TOK_OPEN_BRACE;
	92	case CLOSEBRACE:
	93	return TOK_CLOSE_BRACE;
	94	case COMMA:
	95	return TOK_COMMA;
	96	case U_EOF:
	97	return TOK_EOF;
	98	case COLON:
	99	return TOK_COLON;
	100
	101	default:
	102	result = getStringToken(buf, c, token, status);
	103	}
	104
	105	*linenumber = lineCount;
	106	return result;
	107	}
	108
	109	/* Copy a string token into the given UnicodeString. Upon entry, we
	110	have already read the first character of the string token, which is
	111	not a whitespace character (but may be a QUOTE or ESCAPE). This
	112	function reads all subsequent characters that belong with this
	113	string, and copy them into the token parameter. The other
	114	important, and slightly convoluted purpose of this function is to
	115	merge adjacent strings. It looks forward a bit, and if the next
	116	non comment, non whitespace item is a string, it reads it in as
	117	well. If two adjacent strings are quoted, they are merged without
	118	intervening space. Otherwise a single SPACE character is
	119	inserted. */
	120	static enum ETokenType getStringToken(UCHARBUF* buf,
	121	UChar32 initialChar,
	122	struct UString *token,
	123	UErrorCode *status) {
	124	UBool lastStringWasQuoted;
	125	UChar32 c;
	126	UChar target[3] = { '\0' };
	127	UChar *pTarget = target;
	128	int len=0;
	129	UBool isFollowingCharEscaped=FALSE;
374ca955 A	130	UBool isNLUnescaped = FALSE;
374ca955 A	131	UChar32 prevC=0;
b75a7d8f A	132
	133	/* We are guaranteed on entry that initialChar is not a whitespace
	134	character. If we are at the EOF, or have some other problem, it
	135	doesn't matter; we still want to validly return the initialChar
	136	(if nothing else) as a string token. */
	137
	138	if (U_FAILURE(*status)) {
	139	return TOK_ERROR;
	140	}
	141
	142	/* setup */
	143	lastStringWasQuoted = FALSE;
	144	c = initialChar;
	145	ustr_setlen(token, 0, status);
	146
	147	if (U_FAILURE(*status)) {
	148	return TOK_ERROR;
	149	}
	150
	151	for (;;) {
	152	if (c == QUOTE) {
	153	if (!lastStringWasQuoted && token->fLength > 0) {
	154	ustr_ucat(token, SPACE, status);
	155
	156	if (U_FAILURE(*status)) {
	157	return TOK_ERROR;
	158	}
	159	}
	160
	161	lastStringWasQuoted = TRUE;
	162
	163	for (;;) {
	164	c = ucbuf_getc(buf,status);
	165
	166	/* EOF reached */
	167	if (c == U_EOF) {
	168	return TOK_EOF;
	169	}
	170
	171	/* Unterminated quoted strings */
	172	if (U_FAILURE(*status)) {
	173	return TOK_ERROR;
	174	}
	175
	176	if (c == QUOTE && !isFollowingCharEscaped) {
	177	break;
	178	}
	179
	180	if (c == ESCAPE && !isFollowingCharEscaped) {
	181	pTarget = target;
	182	c = unescape(buf, status);
	183
	184	if (c == U_ERR) {
	185	return TOK_ERROR;
	186	}
374ca955 A	187	if(c == CR \|\| c == LF){
	188	isNLUnescaped = TRUE;
	189	}
b75a7d8f A	190	}
	191
	192	if(c==ESCAPE && !isFollowingCharEscaped){
	193	isFollowingCharEscaped = TRUE;
	194	}else{
	195	U_APPEND_CHAR32(c, pTarget,len);
	196	pTarget = target;
	197	ustr_uscat(token, pTarget,len, status);
	198	isFollowingCharEscaped = FALSE;
	199	len=0;
374ca955 A	200	if(c == CR \|\| c == LF){
	201	if(isNLUnescaped == FALSE && prevC!=CR){
	202	lineCount++;
	203	}
	204	isNLUnescaped = FALSE;
	205	}
b75a7d8f A	206	}
	207
	208	if (U_FAILURE(*status)) {
	209	return TOK_ERROR;
	210	}
374ca955	211	prevC = c;
b75a7d8f A	212	}
	213	} else {
	214	if (token->fLength > 0) {
	215	ustr_ucat(token, SPACE, status);
	216
	217	if (U_FAILURE(*status)) {
	218	return TOK_ERROR;
	219	}
	220	}
	221
	222	if(lastStringWasQuoted){
	223	if(getShowWarning()){
	224	warning(lineCount, "Mixing quoted and unquoted strings");
	225	}
	226	if(isStrict()){
	227	return TOK_ERROR;
	228	}
	229
	230	}
	231
	232	lastStringWasQuoted = FALSE;
	233
	234	/* if we reach here we are mixing
	235	* quoted and unquoted strings
	236	* warn in normal mode and error in
	237	* pedantic mode
	238	*/
	239
	240	if (c == ESCAPE) {
	241	pTarget = target;
	242	c = unescape(buf, status);
	243
	244	/* EOF reached */
	245	if (c == U_EOF) {
	246	return TOK_ERROR;
	247	}
	248	}
	249
	250	U_APPEND_CHAR32(c, pTarget,len);
	251	pTarget = target;
	252	ustr_uscat(token, pTarget,len, status);
	253	len=0;
374ca955	254
b75a7d8f A	255	if (U_FAILURE(*status)) {
	256	return TOK_ERROR;
	257	}
	258
	259	for (;;) {
	260	/* DON'T skip whitespace */
374ca955	261	c = getNextChar(buf, FALSE, NULL, status);
b75a7d8f A	262
	263	/* EOF reached */
	264	if (c == U_EOF) {
	265	ucbuf_ungetc(c, buf);
	266	return TOK_STRING;
	267	}
	268
	269	if (U_FAILURE(*status)) {
	270	return TOK_STRING;
	271	}
	272
	273	if (c == QUOTE
	274	\|\| c == OPENBRACE
	275	\|\| c == CLOSEBRACE
	276	\|\| c == COMMA
	277	\|\| c == COLON) {
	278	ucbuf_ungetc(c, buf);
	279	break;
	280	}
	281
	282	if (isWhitespace(c)) {
	283	break;
	284	}
	285
	286	if (c == ESCAPE) {
	287	pTarget = target;
	288	c = unescape(buf, status);
	289
	290	if (c == U_ERR) {
	291	return TOK_ERROR;
	292	}
	293	}
	294
	295	U_APPEND_CHAR32(c, pTarget,len);
	296	pTarget = target;
	297	ustr_uscat(token, pTarget,len, status);
	298	len=0;
	299	if (U_FAILURE(*status)) {
	300	return TOK_ERROR;
	301	}
	302	}
	303	}
	304
	305	/* DO skip whitespace */
374ca955	306	c = getNextChar(buf, TRUE, NULL, status);
b75a7d8f A	307
	308	if (U_FAILURE(*status)) {
	309	return TOK_STRING;
	310	}
	311
	312	if (c == OPENBRACE \|\| c == CLOSEBRACE \|\| c == COMMA \|\| c == COLON) {
	313	ucbuf_ungetc(c, buf);
	314	return TOK_STRING;
	315	}
	316	}
	317	}
	318
374ca955	319	/* Retrieve the next character. If skipwhite is
b75a7d8f A	320	true, whitespace is skipped as well. */
	321	static UChar32 getNextChar(UCHARBUF* buf,
	322	UBool skipwhite,
374ca955	323	struct UString *token,
b75a7d8f	324	UErrorCode *status) {
374ca955	325	UChar32 c, c2;
b75a7d8f A	326
	327	if (U_FAILURE(*status)) {
	328	return U_EOF;
	329	}
	330
	331	for (;;) {
	332	c = ucbuf_getc(buf,status);
	333
	334	if (c == U_EOF) {
	335	return U_EOF;
	336	}
	337
	338	if (skipwhite && isWhitespace(c)) {
	339	continue;
	340	}
	341
	342	/* This also handles the get() failing case */
	343	if (c != SLASH) {
	344	return c;
	345	}
	346
46f4442e	347	c = ucbuf_getc(buf,status); /* "/c" */
b75a7d8f A	348
	349	if (c == U_EOF) {
	350	return U_EOF;
	351	}
	352
	353	switch (c) {
46f4442e	354	case SLASH: /* "//" */
374ca955	355	seekUntilNewline(buf, NULL, status);
b75a7d8f A	356	break;
b75a7d8f A	357
729e4ab9 A	358	case ASTERISK: /* " / * " */
	359	c2 = ucbuf_getc(buf, status); /* "/ * c" */
	360	if(c2 == ASTERISK){ /* "/ * " /
374ca955 A	361	/* parse multi-line comment and store it in token*/
374ca955 A	362	seekUntilEndOfComment(buf, token, status);
46f4442e	363	} else {
729e4ab9	364	ucbuf_ungetc(c2, buf); /* c2 is the non-asterisk following "/ ". Include c2 back in buffer. /
374ca955 A	365	seekUntilEndOfComment(buf, NULL, status);
374ca955 A	366	}
b75a7d8f A	367	break;
	368
	369	default:
46f4442e	370	ucbuf_ungetc(c, buf); /* "/c" - put back the c */
b75a7d8f A	371	/* If get() failed this is a NOP */
	372	return SLASH;
	373	}
374ca955	374
b75a7d8f A	375	}
	376	}
	377
	378	static void seekUntilNewline(UCHARBUF* buf,
374ca955	379	struct UString *token,
b75a7d8f A	380	UErrorCode *status) {
	381	UChar32 c;
	382
	383	if (U_FAILURE(*status)) {
	384	return;
	385	}
	386
	387	do {
	388	c = ucbuf_getc(buf,status);
374ca955 A	389	/* add the char to token */
	390	if(token!=NULL){
	391	ustr_u32cat(token, c, status);
	392	}
b75a7d8f A	393	} while (!isNewline(c) && c != U_EOF && *status == U_ZERO_ERROR);
	394	}
	395
	396	static void seekUntilEndOfComment(UCHARBUF *buf,
374ca955	397	struct UString *token,
b75a7d8f A	398	UErrorCode *status) {
	399	UChar32 c, d;
	400	uint32_t line;
	401
	402	if (U_FAILURE(*status)) {
	403	return;
	404	}
	405
	406	line = lineCount;
	407
	408	do {
	409	c = ucbuf_getc(buf, status);
	410
	411	if (c == ASTERISK) {
	412	d = ucbuf_getc(buf, status);
	413
	414	if (d != SLASH) {
	415	ucbuf_ungetc(d, buf);
	416	} else {
	417	break;
	418	}
	419	}
374ca955 A	420	/* add the char to token */
	421	if(token!=NULL){
	422	ustr_u32cat(token, c, status);
	423	}
	424	/* increment the lineCount */
	425	isNewline(c);
	426
b75a7d8f A	427	} while (c != U_EOF && *status == U_ZERO_ERROR);
	428
	429	if (c == U_EOF) {
	430	*status = U_INVALID_FORMAT_ERROR;
	431	error(line, "unterminated comment detected");
	432	}
	433	}
	434
4388f060	435	U_CFUNC UChar32 unescape(UCHARBUF buf, UErrorCode status) {
b75a7d8f A	436	if (U_FAILURE(*status)) {
	437	return U_EOF;
	438	}
	439
	440	/* We expect to be called after the ESCAPE has been seen, but
	441	* u_fgetcx needs an ESCAPE to do its magic. */
	442	ucbuf_ungetc(ESCAPE, buf);
	443
	444	return ucbuf_getcx32(buf, status);
	445	}
	446
	447	static UBool isWhitespace(UChar32 c) {
	448	switch (c) {
	449	/* ' ', '\t', '\n', '\r', 0x2029, 0xFEFF */
	450	case 0x000A:
	451	case 0x2029:
	452	lineCount++;
	453	case 0x000D:
	454	case 0x0020:
	455	case 0x0009:
	456	case 0xFEFF:
	457	return TRUE;
	458
	459	default:
	460	return FALSE;
	461	}
	462	}
	463
	464	static UBool isNewline(UChar32 c) {
	465	switch (c) {
	466	/* '\n', '\r', 0x2029 */
	467	case 0x000A:
	468	case 0x2029:
	469	lineCount++;
	470	case 0x000D:
	471	return TRUE;
	472
	473	default:
	474	return FALSE;
	475	}
	476	}