[apple/icu.git] / icuSources / test / intltest / tokiter.cpp

// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
* Copyright (c) 2004-2011, International Business Machines
* Corporation and others.  All Rights Reserved.
**********************************************************************
* Author: Alan Liu
* Created: March 22 2004
* Since: ICU 3.0
**********************************************************************
*/
#include "tokiter.h"
#include "textfile.h"
#include "patternprops.h"
#include "util.h"
#include "uprops.h"

/**
 * Construct a token iterator over the given reader.  No line is read
 * here; the first line is fetched lazily by next().
 * @param r the TextFile that lines are read from; the pointer is
 * stored as-is
 */
TokenIterator::TokenIterator(TextFile* r) {
    // No line has been loaded yet, so neither position is valid.
    lastpos = -1;
    pos = -1;
    haveLine = FALSE;
    done = FALSE;
    reader = r;
}

/**
 * Destructor.  The body is intentionally empty; in particular it does
 * not delete 'reader', the TextFile pointer handed to the constructor.
 */
TokenIterator::~TokenIterator() {
}

/**
 * Fetch the next token, pulling additional lines from the reader
 * whenever the current line has no more tokens.
 * @param token receives the token; it is emptied first, unless the
 * iterator is already done or ec indicates failure on entry
 * @param ec input-output error code; if it already indicates failure
 * nothing is done
 * @return TRUE if a token was produced, or FALSE once the reader has
 * no more lines or an error occurred
 */
UBool TokenIterator::next(UnicodeString& token, UErrorCode& ec) {
    if (U_FAILURE(ec) || done) {
        return FALSE;
    }
    token.truncate(0);
    for (;;) {
        // Load a fresh line when the previous one has been used up.
        if (!haveLine) {
            const UBool gotLine = reader->readLineSkippingComments(line, ec);
            if (!gotLine) {
                // Once the reader stops delivering lines, stay finished.
                done = TRUE;
                return FALSE;
            }
            pos = 0;
            haveLine = TRUE;
        }
        // Remember where this scan attempt began.
        lastpos = pos;
        if (nextToken(token, ec)) {
            return TRUE;
        }
        // Nothing (more) on this line: drop it, and either give up on
        // error or go around again with the next line.
        haveLine = FALSE;
        if (U_FAILURE(ec)) {
            return FALSE;
        }
    }
}

/**
 * Report the line number as tracked by the underlying reader.
 * @return the value of reader->getLineNumber()
 */
int32_t TokenIterator::getLineNumber() const {
    const int32_t currentLine = reader->getLineNumber();
    return currentLine;
}

/**
 * Read the next token from 'this->line', starting at 'this->pos', and
 * append it to 'token'.  'this->pos' is advanced past the text that
 * was consumed.
 *
 * Tokens are separated by Pattern_White_Space.  Tokens may also be
 * delimited by double or single quotes.  The closing quote must match
 * the opening quote.  If a '#' is encountered, the rest of the line
 * is ignored, unless it is backslash-escaped or within quotes.
 *
 * A backslash anywhere in a token, including its first character and
 * inside quotes, introduces an escape sequence that is decoded with
 * UnicodeString::unescapeAt(); the decoded code point is appended in
 * place of the sequence.
 *
 * @param token the token is appended to this UnicodeString; an empty
 * quoted string ("" or '') is a valid token that appends nothing
 * @param ec input-output error code; set to
 * U_MALFORMED_UNICODE_ESCAPE for an escape that cannot be decoded
 * (including a backslash at the very end of the line) and to
 * U_UNTERMINATED_QUOTE when a quoted token has no closing quote
 * @return TRUE if a valid token is found, or FALSE if the end
 * of the line (or a '#' comment) is reached or an error occurs
 */
UBool TokenIterator::nextToken(UnicodeString& token, UErrorCode& ec) {
    ICU_Utility::skipWhitespace(line, pos, TRUE);
    if (pos == line.length()) {
        return FALSE;
    }
    // Characters are compared by numeric code point rather than by
    // char literal, so the checks do not depend on the compiler's
    // execution character set.
    UChar c = line.charAt(pos++);
    UChar quote = 0;
    switch (c) {
    case 34/*'"'*/:
    case 39/*'\''*/:
        quote = c;
        break;
    case 35/*'#'*/:
        return FALSE;
    case 92/*'\\'*/:
        // A token may start with an escape.  Step back onto the
        // backslash so the loop below decodes it, instead of copying
        // the backslash into the token verbatim.
        --pos;
        break;
    default:
        token.append(c);
        break;
    }
    while (pos < line.length()) {
        c = line.charAt(pos); // 16-bit ok
        if (c == 92/*'\\'*/) {
            // unescapeAt() expects the offset just *after* the
            // backslash and advances it past the sequence on success;
            // on failure the offset is left alone and -1 is returned.
            int32_t escapePos = pos + 1;
            UChar32 c32 = line.unescapeAt(escapePos);
            if (c32 < 0) {
                ec = U_MALFORMED_UNICODE_ESCAPE;
                return FALSE;
            }
            token.append(c32);
            pos = escapePos;
        } else if ((quote != 0 && c == quote) ||
                   (quote == 0 && PatternProps::isWhiteSpace(c))) {
            // Consume the terminator (closing quote or white space).
            ++pos;
            return TRUE;
        } else if (quote == 0 && c == 35/*'#'*/) {
            // Leave pos on the '#': the next call will see it first
            // and report end of line.
            return TRUE; // do NOT increment
        } else {
            token.append(c);
            ++pos;
        }
    }
    // Ran off the end of the line: fine for a bare token, an error if
    // an opening quote is still pending.
    if (quote != 0) {
        ec = U_UNTERMINATED_QUOTE;
        return FALSE;
    }
    return TRUE;
}
Commit	Line	Data
f3c0d7a5 A	1	// © 2016 and later: Unicode, Inc. and others.
f3c0d7a5 A	2	// License & terms of use: http://www.unicode.org/copyright.html
374ca955 A	3	/*
374ca955 A	4	**********************************************************************
4388f060	5	* Copyright (c) 2004-2011, International Business Machines
374ca955 A	6	* Corporation and others. All Rights Reserved.
	7	**********************************************************************
	8	* Author: Alan Liu
	9	* Created: March 22 2004
	10	* Since: ICU 3.0
	11	**********************************************************************
	12	*/
	13	#include "tokiter.h"
	14	#include "textfile.h"
4388f060	15	#include "patternprops.h"
374ca955 A	16	#include "util.h"
	17	#include "uprops.h"
	18
	19	TokenIterator::TokenIterator(TextFile* r) {
	20	reader = r;
	21	done = haveLine = FALSE;
	22	pos = lastpos = -1;
	23	}
	24
	25	TokenIterator::~TokenIterator() {
	26	}
	27
	28	UBool TokenIterator::next(UnicodeString& token, UErrorCode& ec) {
	29	if (done \|\| U_FAILURE(ec)) {
	30	return FALSE;
	31	}
	32	token.truncate(0);
	33	for (;;) {
	34	if (!haveLine) {
	35	if (!reader->readLineSkippingComments(line, ec)) {
	36	done = TRUE;
	37	return FALSE;
	38	}
	39	haveLine = TRUE;
	40	pos = 0;
	41	}
	42	lastpos = pos;
	43	if (!nextToken(token, ec)) {
	44	haveLine = FALSE;
	45	if (U_FAILURE(ec)) return FALSE;
	46	continue;
	47	}
	48	return TRUE;
	49	}
	50	}
	51
	52	int32_t TokenIterator::getLineNumber() const {
	53	return reader->getLineNumber();
	54	}
	55
	56	/**
	57	* Read the next token from 'this->line' and append it to 'token'.
4388f060	58	* Tokens are separated by Pattern_White_Space. Tokens may also be
374ca955 A	59	* delimited by double or single quotes. The closing quote must match
	60	* the opening quote. If a '#' is encountered, the rest of the line
	61	* is ignored, unless it is backslash-escaped or within quotes.
	62	* @param token the token is appended to this StringBuffer
	63	* @param ec input-output error code
	64	* @return TRUE if a valid token is found, or FALSE if the end
	65	* of the line is reached or an error occurs
	66	*/
	67	UBool TokenIterator::nextToken(UnicodeString& token, UErrorCode& ec) {
	68	ICU_Utility::skipWhitespace(line, pos, TRUE);
	69	if (pos == line.length()) {
	70	return FALSE;
	71	}
	72	UChar c = line.charAt(pos++);
	73	UChar quote = 0;
	74	switch (c) {
	75	case 34/'"'/:
	76	case 39/'\\'/:
	77	quote = c;
	78	break;
	79	case 35/'#'/:
	80	return FALSE;
	81	default:
	82	token.append(c);
	83	break;
	84	}
	85	while (pos < line.length()) {
	86	c = line.charAt(pos); // 16-bit ok
	87	if (c == 92/'\\'/) {
	88	UChar32 c32 = line.unescapeAt(pos);
	89	if (c32 < 0) {
	90	ec = U_MALFORMED_UNICODE_ESCAPE;
	91	return FALSE;
	92	}
	93	token.append(c32);
	94	} else if ((quote != 0 && c == quote) \|\|
4388f060	95	(quote == 0 && PatternProps::isWhiteSpace(c))) {
374ca955 A	96	++pos;
	97	return TRUE;
	98	} else if (quote == 0 && c == '#') {
	99	return TRUE; // do NOT increment
	100	} else {
	101	token.append(c);
	102	++pos;
	103	}
	104	}
	105	if (quote != 0) {
	106	ec = U_UNTERMINATED_QUOTE;
	107	return FALSE;
	108	}
	109	return TRUE;
	110	}