git.saurik.com Git - apple/icu.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	*******************************************************************************
	3	*
	4	* Copyright (C) 1998-2008, International Business Machines
	5	* Corporation and others. All Rights Reserved.
	6	*
	7	*******************************************************************************
	8	*
	9	* File read.c
	10	*
	11	* Modification History:
	12	*
	13	* Date Name Description
	14	* 05/26/99 stephen Creation.
	15	* 5/10/01 Ram removed ustdio dependency
	16	*******************************************************************************
	17	*/
	18
	19	#include "read.h"
	20	#include "errmsg.h"
	21	#include "unicode/ustring.h"
	22
	23	#define OPENBRACE 0x007B
	24	#define CLOSEBRACE 0x007D
	25	#define COMMA 0x002C
	26	#define QUOTE 0x0022
	27	#define ESCAPE 0x005C
	28	#define SLASH 0x002F
	29	#define ASTERISK 0x002A
	30	#define SPACE 0x0020
	31	#define COLON 0x003A
	32	#define BADBOM 0xFFFE
	33	#define CR 0x000D
	34	#define LF 0x000A
	35
	36	static int32_t lineCount;
	37
	38	/* Protos */
	39	static enum ETokenType getStringToken(UCHARBUF *buf,
	40	UChar32 initialChar,
	41	struct UString *token,
	42	UErrorCode *status);
	43
	44	static UChar32 getNextChar (UCHARBUF buf, UBool skipwhite, struct UString token, UErrorCode *status);
	45	static void seekUntilNewline (UCHARBUF buf, struct UString token, UErrorCode *status);
	46	static void seekUntilEndOfComment (UCHARBUF buf, struct UString token, UErrorCode *status);
	47	static UBool isWhitespace (UChar32 c);
	48	static UBool isNewline (UChar32 c);
	49
	50	void resetLineNumber() {
	51	lineCount = 1;
	52	}
	53
	54	/* Read and return the next token from the stream. If the token is of
	55	type eString, fill in the token parameter with the token. If the
	56	token is eError, then the status parameter will contain the
	57	specific error. This will be eItemNotFound at the end of file,
	58	indicating that all tokens have been returned. This method will
	59	never return eString twice in a row; instead, multiple adjacent
	60	string tokens will be merged into one, with no intervening
	61	space. */
	62	enum ETokenType getNextToken(UCHARBUF* buf,
	63	struct UString *token,
	64	uint32_t linenumber, / out: linenumber of token */
	65	struct UString *comment,
	66	UErrorCode *status) {
	67	enum ETokenType result;
	68	UChar32 c;
	69
	70	if (U_FAILURE(*status)) {
	71	return TOK_ERROR;
	72	}
	73
	74	/* Skip whitespace */
	75	c = getNextChar(buf, TRUE, comment, status);
	76
	77	if (U_FAILURE(*status)) {
	78	return TOK_ERROR;
	79	}
	80
	81	*linenumber = lineCount;
	82
	83	switch(c) {
	84	case BADBOM:
	85	return TOK_ERROR;
	86	case OPENBRACE:
	87	return TOK_OPEN_BRACE;
	88	case CLOSEBRACE:
	89	return TOK_CLOSE_BRACE;
	90	case COMMA:
	91	return TOK_COMMA;
	92	case U_EOF:
	93	return TOK_EOF;
	94	case COLON:
	95	return TOK_COLON;
	96
	97	default:
	98	result = getStringToken(buf, c, token, status);
	99	}
	100
	101	*linenumber = lineCount;
	102	return result;
	103	}
	104
	105	/* Copy a string token into the given UnicodeString. Upon entry, we
	106	have already read the first character of the string token, which is
	107	not a whitespace character (but may be a QUOTE or ESCAPE). This
	108	function reads all subsequent characters that belong with this
	109	string, and copy them into the token parameter. The other
	110	important, and slightly convoluted purpose of this function is to
	111	merge adjacent strings. It looks forward a bit, and if the next
	112	non comment, non whitespace item is a string, it reads it in as
	113	well. If two adjacent strings are quoted, they are merged without
	114	intervening space. Otherwise a single SPACE character is
	115	inserted. */
	116	static enum ETokenType getStringToken(UCHARBUF* buf,
	117	UChar32 initialChar,
	118	struct UString *token,
	119	UErrorCode *status) {
	120	UBool lastStringWasQuoted;
	121	UChar32 c;
	122	UChar target[3] = { '\0' };
	123	UChar *pTarget = target;
	124	int len=0;
	125	UBool isFollowingCharEscaped=FALSE;
	126	UBool isNLUnescaped = FALSE;
	127	UChar32 prevC=0;
	128
	129	/* We are guaranteed on entry that initialChar is not a whitespace
	130	character. If we are at the EOF, or have some other problem, it
	131	doesn't matter; we still want to validly return the initialChar
	132	(if nothing else) as a string token. */
	133
	134	if (U_FAILURE(*status)) {
	135	return TOK_ERROR;
	136	}
	137
	138	/* setup */
	139	lastStringWasQuoted = FALSE;
	140	c = initialChar;
	141	ustr_setlen(token, 0, status);
	142
	143	if (U_FAILURE(*status)) {
	144	return TOK_ERROR;
	145	}
	146
	147	for (;;) {
	148	if (c == QUOTE) {
	149	if (!lastStringWasQuoted && token->fLength > 0) {
	150	ustr_ucat(token, SPACE, status);
	151
	152	if (U_FAILURE(*status)) {
	153	return TOK_ERROR;
	154	}
	155	}
	156
	157	lastStringWasQuoted = TRUE;
	158
	159	for (;;) {
	160	c = ucbuf_getc(buf,status);
	161
	162	/* EOF reached */
	163	if (c == U_EOF) {
	164	return TOK_EOF;
	165	}
	166
	167	/* Unterminated quoted strings */
	168	if (U_FAILURE(*status)) {
	169	return TOK_ERROR;
	170	}
	171
	172	if (c == QUOTE && !isFollowingCharEscaped) {
	173	break;
	174	}
	175
	176	if (c == ESCAPE && !isFollowingCharEscaped) {
	177	pTarget = target;
	178	c = unescape(buf, status);
	179
	180	if (c == U_ERR) {
	181	return TOK_ERROR;
	182	}
	183	if(c == CR \|\| c == LF){
	184	isNLUnescaped = TRUE;
	185	}
	186	}
	187
	188	if(c==ESCAPE && !isFollowingCharEscaped){
	189	isFollowingCharEscaped = TRUE;
	190	}else{
	191	U_APPEND_CHAR32(c, pTarget,len);
	192	pTarget = target;
	193	ustr_uscat(token, pTarget,len, status);
	194	isFollowingCharEscaped = FALSE;
	195	len=0;
	196	if(c == CR \|\| c == LF){
	197	if(isNLUnescaped == FALSE && prevC!=CR){
	198	lineCount++;
	199	}
	200	isNLUnescaped = FALSE;
	201	}
	202	}
	203
	204	if (U_FAILURE(*status)) {
	205	return TOK_ERROR;
	206	}
	207	prevC = c;
	208	}
	209	} else {
	210	if (token->fLength > 0) {
	211	ustr_ucat(token, SPACE, status);
	212
	213	if (U_FAILURE(*status)) {
	214	return TOK_ERROR;
	215	}
	216	}
	217
	218	if(lastStringWasQuoted){
	219	if(getShowWarning()){
	220	warning(lineCount, "Mixing quoted and unquoted strings");
	221	}
	222	if(isStrict()){
	223	return TOK_ERROR;
	224	}
	225
	226	}
	227
	228	lastStringWasQuoted = FALSE;
	229
	230	/* if we reach here we are mixing
	231	* quoted and unquoted strings
	232	* warn in normal mode and error in
	233	* pedantic mode
	234	*/
	235
	236	if (c == ESCAPE) {
	237	pTarget = target;
	238	c = unescape(buf, status);
	239
	240	/* EOF reached */
	241	if (c == U_EOF) {
	242	return TOK_ERROR;
	243	}
	244	}
	245
	246	U_APPEND_CHAR32(c, pTarget,len);
	247	pTarget = target;
	248	ustr_uscat(token, pTarget,len, status);
	249	len=0;
	250
	251	if (U_FAILURE(*status)) {
	252	return TOK_ERROR;
	253	}
	254
	255	for (;;) {
	256	/* DON'T skip whitespace */
	257	c = getNextChar(buf, FALSE, NULL, status);
	258
	259	/* EOF reached */
	260	if (c == U_EOF) {
	261	ucbuf_ungetc(c, buf);
	262	return TOK_STRING;
	263	}
	264
	265	if (U_FAILURE(*status)) {
	266	return TOK_STRING;
	267	}
	268
	269	if (c == QUOTE
	270	\|\| c == OPENBRACE
	271	\|\| c == CLOSEBRACE
	272	\|\| c == COMMA
	273	\|\| c == COLON) {
	274	ucbuf_ungetc(c, buf);
	275	break;
	276	}
	277
	278	if (isWhitespace(c)) {
	279	break;
	280	}
	281
	282	if (c == ESCAPE) {
	283	pTarget = target;
	284	c = unescape(buf, status);
	285
	286	if (c == U_ERR) {
	287	return TOK_ERROR;
	288	}
	289	}
	290
	291	U_APPEND_CHAR32(c, pTarget,len);
	292	pTarget = target;
	293	ustr_uscat(token, pTarget,len, status);
	294	len=0;
	295	if (U_FAILURE(*status)) {
	296	return TOK_ERROR;
	297	}
	298	}
	299	}
	300
	301	/* DO skip whitespace */
	302	c = getNextChar(buf, TRUE, NULL, status);
	303
	304	if (U_FAILURE(*status)) {
	305	return TOK_STRING;
	306	}
	307
	308	if (c == OPENBRACE \|\| c == CLOSEBRACE \|\| c == COMMA \|\| c == COLON) {
	309	ucbuf_ungetc(c, buf);
	310	return TOK_STRING;
	311	}
	312	}
	313	}
	314
	315	/* Retrieve the next character. If skipwhite is
	316	true, whitespace is skipped as well. */
	317	static UChar32 getNextChar(UCHARBUF* buf,
	318	UBool skipwhite,
	319	struct UString *token,
	320	UErrorCode *status) {
	321	UChar32 c, c2;
	322
	323	if (U_FAILURE(*status)) {
	324	return U_EOF;
	325	}
	326
	327	for (;;) {
	328	c = ucbuf_getc(buf,status);
	329
	330	if (c == U_EOF) {
	331	return U_EOF;
	332	}
	333
	334	if (skipwhite && isWhitespace(c)) {
	335	continue;
	336	}
	337
	338	/* This also handles the get() failing case */
	339	if (c != SLASH) {
	340	return c;
	341	}
	342
	343	c = ucbuf_getc(buf,status); /* "/c" */
	344
	345	if (c == U_EOF) {
	346	return U_EOF;
	347	}
	348
	349	switch (c) {
	350	case SLASH: /* "//" */
	351	seekUntilNewline(buf, NULL, status);
	352	break;
	353
	354	case ASTERISK: /* "/" /
	355	c2 = ucbuf_getc(buf, status); /* "/c" /
	356	if(c2 == ASTERISK){ /* "/*" /
	357	/* parse multi-line comment and store it in token*/
	358	seekUntilEndOfComment(buf, token, status);
	359	} else {
	360	ucbuf_ungetc(c2, buf); /* c2 is the non-asterisk following "/". Include c2 back in buffer. /
	361	seekUntilEndOfComment(buf, NULL, status);
	362	}
	363	break;
	364
	365	default:
	366	ucbuf_ungetc(c, buf); /* "/c" - put back the c */
	367	/* If get() failed this is a NOP */
	368	return SLASH;
	369	}
	370
	371	}
	372	}
	373
	374	static void seekUntilNewline(UCHARBUF* buf,
	375	struct UString *token,
	376	UErrorCode *status) {
	377	UChar32 c;
	378
	379	if (U_FAILURE(*status)) {
	380	return;
	381	}
	382
	383	do {
	384	c = ucbuf_getc(buf,status);
	385	/* add the char to token */
	386	if(token!=NULL){
	387	ustr_u32cat(token, c, status);
	388	}
	389	} while (!isNewline(c) && c != U_EOF && *status == U_ZERO_ERROR);
	390	}
	391
	392	static void seekUntilEndOfComment(UCHARBUF *buf,
	393	struct UString *token,
	394	UErrorCode *status) {
	395	UChar32 c, d;
	396	uint32_t line;
	397
	398	if (U_FAILURE(*status)) {
	399	return;
	400	}
	401
	402	line = lineCount;
	403
	404	do {
	405	c = ucbuf_getc(buf, status);
	406
	407	if (c == ASTERISK) {
	408	d = ucbuf_getc(buf, status);
	409
	410	if (d != SLASH) {
	411	ucbuf_ungetc(d, buf);
	412	} else {
	413	break;
	414	}
	415	}
	416	/* add the char to token */
	417	if(token!=NULL){
	418	ustr_u32cat(token, c, status);
	419	}
	420	/* increment the lineCount */
	421	isNewline(c);
	422
	423	} while (c != U_EOF && *status == U_ZERO_ERROR);
	424
	425	if (c == U_EOF) {
	426	*status = U_INVALID_FORMAT_ERROR;
	427	error(line, "unterminated comment detected");
	428	}
	429	}
	430
	431	UChar32 unescape(UCHARBUF *buf,
	432	UErrorCode *status) {
	433	if (U_FAILURE(*status)) {
	434	return U_EOF;
	435	}
	436
	437	/* We expect to be called after the ESCAPE has been seen, but
	438	* u_fgetcx needs an ESCAPE to do its magic. */
	439	ucbuf_ungetc(ESCAPE, buf);
	440
	441	return ucbuf_getcx32(buf, status);
	442	}
	443
	444	static UBool isWhitespace(UChar32 c) {
	445	switch (c) {
	446	/* ' ', '\t', '\n', '\r', 0x2029, 0xFEFF */
	447	case 0x000A:
	448	case 0x2029:
	449	lineCount++;
	450	case 0x000D:
	451	case 0x0020:
	452	case 0x0009:
	453	case 0xFEFF:
	454	return TRUE;
	455
	456	default:
	457	return FALSE;
	458	}
	459	}
	460
	461	static UBool isNewline(UChar32 c) {
	462	switch (c) {
	463	/* '\n', '\r', 0x2029 */
	464	case 0x000A:
	465	case 0x2029:
	466	lineCount++;
	467	case 0x000D:
	468	return TRUE;
	469
	470	default:
	471	return FALSE;
	472	}
	473	}