git.saurik.com Git - apple/icu.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	*******************************************************************************
	3	*
	4	* Copyright (C) 1998-2011, International Business Machines
	5	* Corporation and others. All Rights Reserved.
	6	*
	7	*******************************************************************************
	8	*
	9	* File read.c
	10	*
	11	* Modification History:
	12	*
	13	* Date Name Description
	14	* 05/26/99 stephen Creation.
	15	* 5/10/01 Ram removed ustdio dependency
	16	*******************************************************************************
	17	*/
	18
	19	#include "read.h"
	20	#include "errmsg.h"
	21	#include "unicode/ustring.h"
	22
	/* Code points of the characters the tokenizer treats specially. */
	#define OPENBRACE  0x007B /* '{' */
	#define CLOSEBRACE 0x007D /* '}' */
	#define COMMA      0x002C /* ',' */
	#define QUOTE      0x0022 /* '"' */
	#define ESCAPE     0x005C /* backslash */
	#define SLASH      0x002F /* '/' */
	#define ASTERISK   0x002A /* '*' */
	#define SPACE      0x0020 /* ' ' */
	#define COLON      0x003A /* ':' */
	#define BADBOM     0xFFFE /* U+FFFE; getNextToken() answers TOK_ERROR for it */
	#define CR         0x000D /* carriage return */
	#define LF         0x000A /* line feed */

	/* Current line number, shared by all tokenizer functions in this file.
	   resetLineNumber() sets it to 1; it is incremented as line breaks are read. */
	static int32_t lineCount;
	37
	38	/* Protos */
	39	static enum ETokenType getStringToken(UCHARBUF *buf,
	40	UChar32 initialChar,
	41	struct UString *token,
	42	UErrorCode *status);
	43
	44	static UChar32 getNextChar (UCHARBUF buf, UBool skipwhite, struct UString token, UErrorCode *status);
	45	static void seekUntilNewline (UCHARBUF buf, struct UString token, UErrorCode *status);
	46	static void seekUntilEndOfComment (UCHARBUF buf, struct UString token, UErrorCode *status);
	47	static UBool isWhitespace (UChar32 c);
	48	static UBool isNewline (UChar32 c);
	49
	50	U_CFUNC void resetLineNumber() {
	51	lineCount = 1;
	52	}
	53
	54	/* Read and return the next token from the stream. If the token is of
	55	type eString, fill in the token parameter with the token. If the
	56	token is eError, then the status parameter will contain the
	57	specific error. This will be eItemNotFound at the end of file,
	58	indicating that all tokens have been returned. This method will
	59	never return eString twice in a row; instead, multiple adjacent
	60	string tokens will be merged into one, with no intervening
	61	space. */
	62	U_CFUNC enum ETokenType
	63	getNextToken(UCHARBUF* buf,
	64	struct UString *token,
	65	uint32_t linenumber, / out: linenumber of token */
	66	struct UString *comment,
	67	UErrorCode *status) {
	68	enum ETokenType result;
	69	UChar32 c;
	70
	71	if (U_FAILURE(*status)) {
	72	return TOK_ERROR;
	73	}
	74
	75	/* Skip whitespace */
	76	c = getNextChar(buf, TRUE, comment, status);
	77
	78	if (U_FAILURE(*status)) {
	79	return TOK_ERROR;
	80	}
	81
	82	*linenumber = lineCount;
	83
	84	switch(c) {
	85	case BADBOM:
	86	return TOK_ERROR;
	87	case OPENBRACE:
	88	return TOK_OPEN_BRACE;
	89	case CLOSEBRACE:
	90	return TOK_CLOSE_BRACE;
	91	case COMMA:
	92	return TOK_COMMA;
	93	case U_EOF:
	94	return TOK_EOF;
	95	case COLON:
	96	return TOK_COLON;
	97
	98	default:
	99	result = getStringToken(buf, c, token, status);
	100	}
	101
	102	*linenumber = lineCount;
	103	return result;
	104	}
	105
	106	/* Copy a string token into the given UnicodeString. Upon entry, we
	107	have already read the first character of the string token, which is
	108	not a whitespace character (but may be a QUOTE or ESCAPE). This
	109	function reads all subsequent characters that belong with this
	110	string, and copy them into the token parameter. The other
	111	important, and slightly convoluted purpose of this function is to
	112	merge adjacent strings. It looks forward a bit, and if the next
	113	non comment, non whitespace item is a string, it reads it in as
	114	well. If two adjacent strings are quoted, they are merged without
	115	intervening space. Otherwise a single SPACE character is
	116	inserted. */
	117	static enum ETokenType getStringToken(UCHARBUF* buf,
	118	UChar32 initialChar,
	119	struct UString *token,
	120	UErrorCode *status) {
	121	UBool lastStringWasQuoted;
	122	UChar32 c;
	123	UChar target[3] = { '\0' };
	124	UChar *pTarget = target;
	125	int len=0;
	126	UBool isFollowingCharEscaped=FALSE;
	127	UBool isNLUnescaped = FALSE;
	128	UChar32 prevC=0;
	129
	130	/* We are guaranteed on entry that initialChar is not a whitespace
	131	character. If we are at the EOF, or have some other problem, it
	132	doesn't matter; we still want to validly return the initialChar
	133	(if nothing else) as a string token. */
	134
	135	if (U_FAILURE(*status)) {
	136	return TOK_ERROR;
	137	}
	138
	139	/* setup */
	140	lastStringWasQuoted = FALSE;
	141	c = initialChar;
	142	ustr_setlen(token, 0, status);
	143
	144	if (U_FAILURE(*status)) {
	145	return TOK_ERROR;
	146	}
	147
	148	for (;;) {
	149	if (c == QUOTE) {
	150	if (!lastStringWasQuoted && token->fLength > 0) {
	151	ustr_ucat(token, SPACE, status);
	152
	153	if (U_FAILURE(*status)) {
	154	return TOK_ERROR;
	155	}
	156	}
	157
	158	lastStringWasQuoted = TRUE;
	159
	160	for (;;) {
	161	c = ucbuf_getc(buf,status);
	162
	163	/* EOF reached */
	164	if (c == U_EOF) {
	165	return TOK_EOF;
	166	}
	167
	168	/* Unterminated quoted strings */
	169	if (U_FAILURE(*status)) {
	170	return TOK_ERROR;
	171	}
	172
	173	if (c == QUOTE && !isFollowingCharEscaped) {
	174	break;
	175	}
	176
	177	if (c == ESCAPE && !isFollowingCharEscaped) {
	178	pTarget = target;
	179	c = unescape(buf, status);
	180
	181	if (c == U_ERR) {
	182	return TOK_ERROR;
	183	}
	184	if(c == CR \|\| c == LF){
	185	isNLUnescaped = TRUE;
	186	}
	187	}
	188
	189	if(c==ESCAPE && !isFollowingCharEscaped){
	190	isFollowingCharEscaped = TRUE;
	191	}else{
	192	U_APPEND_CHAR32(c, pTarget,len);
	193	pTarget = target;
	194	ustr_uscat(token, pTarget,len, status);
	195	isFollowingCharEscaped = FALSE;
	196	len=0;
	197	if(c == CR \|\| c == LF){
	198	if(isNLUnescaped == FALSE && prevC!=CR){
	199	lineCount++;
	200	}
	201	isNLUnescaped = FALSE;
	202	}
	203	}
	204
	205	if (U_FAILURE(*status)) {
	206	return TOK_ERROR;
	207	}
	208	prevC = c;
	209	}
	210	} else {
	211	if (token->fLength > 0) {
	212	ustr_ucat(token, SPACE, status);
	213
	214	if (U_FAILURE(*status)) {
	215	return TOK_ERROR;
	216	}
	217	}
	218
	219	if(lastStringWasQuoted){
	220	if(getShowWarning()){
	221	warning(lineCount, "Mixing quoted and unquoted strings");
	222	}
	223	if(isStrict()){
	224	return TOK_ERROR;
	225	}
	226
	227	}
	228
	229	lastStringWasQuoted = FALSE;
	230
	231	/* if we reach here we are mixing
	232	* quoted and unquoted strings
	233	* warn in normal mode and error in
	234	* pedantic mode
	235	*/
	236
	237	if (c == ESCAPE) {
	238	pTarget = target;
	239	c = unescape(buf, status);
	240
	241	/* EOF reached */
	242	if (c == U_EOF) {
	243	return TOK_ERROR;
	244	}
	245	}
	246
	247	U_APPEND_CHAR32(c, pTarget,len);
	248	pTarget = target;
	249	ustr_uscat(token, pTarget,len, status);
	250	len=0;
	251
	252	if (U_FAILURE(*status)) {
	253	return TOK_ERROR;
	254	}
	255
	256	for (;;) {
	257	/* DON'T skip whitespace */
	258	c = getNextChar(buf, FALSE, NULL, status);
	259
	260	/* EOF reached */
	261	if (c == U_EOF) {
	262	ucbuf_ungetc(c, buf);
	263	return TOK_STRING;
	264	}
	265
	266	if (U_FAILURE(*status)) {
	267	return TOK_STRING;
	268	}
	269
	270	if (c == QUOTE
	271	\|\| c == OPENBRACE
	272	\|\| c == CLOSEBRACE
	273	\|\| c == COMMA
	274	\|\| c == COLON) {
	275	ucbuf_ungetc(c, buf);
	276	break;
	277	}
	278
	279	if (isWhitespace(c)) {
	280	break;
	281	}
	282
	283	if (c == ESCAPE) {
	284	pTarget = target;
	285	c = unescape(buf, status);
	286
	287	if (c == U_ERR) {
	288	return TOK_ERROR;
	289	}
	290	}
	291
	292	U_APPEND_CHAR32(c, pTarget,len);
	293	pTarget = target;
	294	ustr_uscat(token, pTarget,len, status);
	295	len=0;
	296	if (U_FAILURE(*status)) {
	297	return TOK_ERROR;
	298	}
	299	}
	300	}
	301
	302	/* DO skip whitespace */
	303	c = getNextChar(buf, TRUE, NULL, status);
	304
	305	if (U_FAILURE(*status)) {
	306	return TOK_STRING;
	307	}
	308
	309	if (c == OPENBRACE \|\| c == CLOSEBRACE \|\| c == COMMA \|\| c == COLON) {
	310	ucbuf_ungetc(c, buf);
	311	return TOK_STRING;
	312	}
	313	}
	314	}
	315
	316	/* Retrieve the next character. If skipwhite is
	317	true, whitespace is skipped as well. */
	318	static UChar32 getNextChar(UCHARBUF* buf,
	319	UBool skipwhite,
	320	struct UString *token,
	321	UErrorCode *status) {
	322	UChar32 c, c2;
	323
	324	if (U_FAILURE(*status)) {
	325	return U_EOF;
	326	}
	327
	328	for (;;) {
	329	c = ucbuf_getc(buf,status);
	330
	331	if (c == U_EOF) {
	332	return U_EOF;
	333	}
	334
	335	if (skipwhite && isWhitespace(c)) {
	336	continue;
	337	}
	338
	339	/* This also handles the get() failing case */
	340	if (c != SLASH) {
	341	return c;
	342	}
	343
	344	c = ucbuf_getc(buf,status); /* "/c" */
	345
	346	if (c == U_EOF) {
	347	return U_EOF;
	348	}
	349
	350	switch (c) {
	351	case SLASH: /* "//" */
	352	seekUntilNewline(buf, NULL, status);
	353	break;
	354
	355	case ASTERISK: /* " / * " */
	356	c2 = ucbuf_getc(buf, status); /* "/ * c" */
	357	if(c2 == ASTERISK){ /* "/ * " /
	358	/* parse multi-line comment and store it in token*/
	359	seekUntilEndOfComment(buf, token, status);
	360	} else {
	361	ucbuf_ungetc(c2, buf); /* c2 is the non-asterisk following "/ ". Include c2 back in buffer. /
	362	seekUntilEndOfComment(buf, NULL, status);
	363	}
	364	break;
	365
	366	default:
	367	ucbuf_ungetc(c, buf); /* "/c" - put back the c */
	368	/* If get() failed this is a NOP */
	369	return SLASH;
	370	}
	371
	372	}
	373	}
	374
	375	static void seekUntilNewline(UCHARBUF* buf,
	376	struct UString *token,
	377	UErrorCode *status) {
	378	UChar32 c;
	379
	380	if (U_FAILURE(*status)) {
	381	return;
	382	}
	383
	384	do {
	385	c = ucbuf_getc(buf,status);
	386	/* add the char to token */
	387	if(token!=NULL){
	388	ustr_u32cat(token, c, status);
	389	}
	390	} while (!isNewline(c) && c != U_EOF && *status == U_ZERO_ERROR);
	391	}
	392
	393	static void seekUntilEndOfComment(UCHARBUF *buf,
	394	struct UString *token,
	395	UErrorCode *status) {
	396	UChar32 c, d;
	397	uint32_t line;
	398
	399	if (U_FAILURE(*status)) {
	400	return;
	401	}
	402
	403	line = lineCount;
	404
	405	do {
	406	c = ucbuf_getc(buf, status);
	407
	408	if (c == ASTERISK) {
	409	d = ucbuf_getc(buf, status);
	410
	411	if (d != SLASH) {
	412	ucbuf_ungetc(d, buf);
	413	} else {
	414	break;
	415	}
	416	}
	417	/* add the char to token */
	418	if(token!=NULL){
	419	ustr_u32cat(token, c, status);
	420	}
	421	/* increment the lineCount */
	422	isNewline(c);
	423
	424	} while (c != U_EOF && *status == U_ZERO_ERROR);
	425
	426	if (c == U_EOF) {
	427	*status = U_INVALID_FORMAT_ERROR;
	428	error(line, "unterminated comment detected");
	429	}
	430	}
	431
	432	U_CFUNC UChar32 unescape(UCHARBUF buf, UErrorCode status) {
	433	if (U_FAILURE(*status)) {
	434	return U_EOF;
	435	}
	436
	437	/* We expect to be called after the ESCAPE has been seen, but
	438	* u_fgetcx needs an ESCAPE to do its magic. */
	439	ucbuf_ungetc(ESCAPE, buf);
	440
	441	return ucbuf_getcx32(buf, status);
	442	}
	443
	444	static UBool isWhitespace(UChar32 c) {
	445	switch (c) {
	446	/* ' ', '\t', '\n', '\r', 0x2029, 0xFEFF */
	447	case 0x000A:
	448	case 0x2029:
	449	lineCount++;
	450	case 0x000D:
	451	case 0x0020:
	452	case 0x0009:
	453	case 0xFEFF:
	454	return TRUE;
	455
	456	default:
	457	return FALSE;
	458	}
	459	}
	460
	461	static UBool isNewline(UChar32 c) {
	462	switch (c) {
	463	/* '\n', '\r', 0x2029 */
	464	case 0x000A:
	465	case 0x2029:
	466	lineCount++;
	467	case 0x000D:
	468	return TRUE;
	469
	470	default:
	471	return FALSE;
	472	}
	473	}