[apple/icu.git] / icuSources / i18n / strmatch.h

// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
 * Copyright (C) 2001-2011, International Business Machines Corporation
 * and others. All Rights Reserved.
 **********************************************************************
 *   Date        Name        Description
 *   07/23/01    aliu        Creation.
 **********************************************************************
 */
#ifndef STRMATCH_H
#define STRMATCH_H

#include "unicode/utypes.h"

#if !UCONFIG_NO_TRANSLITERATION

#include "unicode/unistr.h"
#include "unicode/unifunct.h"
#include "unicode/unimatch.h"
#include "unicode/unirepl.h"

U_NAMESPACE_BEGIN

class TransliterationRuleData;

/**
 * An object that matches a fixed input string, implementing the
 * UnicodeMatcher API.  This object also implements the
 * UnicodeReplacer API, allowing it to emit the matched text as
 * output.  Since the match text may contain flexible match elements,
 * such as UnicodeSets, the emitted text is not the match pattern, but
 * instead a substring of the actual matched text.  Following
 * convention, the output text is the leftmost match seen up to this
 * point.
 *
 * NOTE(review): the comments on the private members matchStart and
 * matchLimit describe the recorded match as the <em>rightmost</em>
 * match, which disagrees with "leftmost" above.  Confirm against the
 * implementation which of the two is correct.
 *
 * A StringMatcher may represent a segment, in which case it has a
 * positive segment number.  This affects how the matcher converts
 * itself to a pattern but does not otherwise affect its function.
 *
 * A StringMatcher that is not a segment should not be used as a
 * UnicodeReplacer.
 *
 * The class derives from UnicodeFunctor, UnicodeMatcher and
 * UnicodeReplacer; toMatcher() and toReplacer() expose the latter two
 * views of the same object.
 */
class StringMatcher : public UnicodeFunctor, public UnicodeMatcher, public UnicodeReplacer {

 public:

    /**
     * Construct a matcher that matches the given pattern string.
     * @param string the pattern to be matched, possibly containing
     * stand-ins that represent nested UnicodeMatcher objects.
     * @param start inclusive start index of text to be replaced
     * @param limit exclusive end index of text to be replaced;
     * must be greater than or equal to start
     * @param segmentNum the segment number from 1..n, or 0 if this is
     * not a segment.
     * @param data context object mapping stand-ins to
     * UnicodeMatcher objects.
     *
     * NOTE(review): the wording for 'start' and 'limit' is identical
     * to that of replace() and speaks of "text to be replaced";
     * presumably these indices instead delimit the portion of
     * 'string' that is used as the pattern -- confirm against the
     * implementation.
     * NOTE(review): 'data' is taken by reference here but stored as a
     * pointer member; presumably the object is not owned and must
     * outlive this matcher -- verify.
     */
    StringMatcher(const UnicodeString& string,
                  int32_t start,
                  int32_t limit,
                  int32_t segmentNum,
                  const TransliterationRuleData& data);

    /**
     * Copy constructor
     * @param o  the object to be copied.
     */
    StringMatcher(const StringMatcher& o);
        
    /**
     * Destructor
     */
    virtual ~StringMatcher();

    /**
     * Implement UnicodeFunctor
     * @return a copy of the object, typed as a UnicodeFunctor pointer.
     */
    virtual UnicodeFunctor* clone() const;

    /**
     * UnicodeFunctor API.  Cast 'this' to a UnicodeMatcher* pointer
     * and return the pointer.  Note that the method is const but the
     * returned pointer is not.
     * @return the UnicodeMatcher pointer.
     */
    virtual UnicodeMatcher* toMatcher() const;

    /**
     * UnicodeFunctor API.  Cast 'this' to a UnicodeReplacer* pointer
     * and return the pointer.  Note that the method is const but the
     * returned pointer is not.
     * @return the UnicodeReplacer pointer.
     */
    virtual UnicodeReplacer* toReplacer() const;

    /**
     * Implement UnicodeMatcher
     *
     * This method is not const.  NOTE(review): presumably it records
     * the position of a successful match in matchStart/matchLimit so
     * that replace() can later emit it -- confirm in the
     * implementation.
     *
     * @param text the text to be matched
     * @param offset on input, the index into text at which to begin
     * matching.  On output, the limit of the matched text.  The
     * number of matched characters is the output value of offset
     * minus the input value.  Offset should always point to the
     * HIGH SURROGATE (leading code unit) of a pair of surrogates,
     * both on entry and upon return.
     * @param limit the limit index of text to be matched.  Greater
     * than offset for a forward direction match, less than offset for
     * a backward direction match.  The last character to be
     * considered for matching will be text.charAt(limit-1) in the
     * forward direction or text.charAt(limit+1) in the backward
     * direction.
     * @param incremental  if TRUE, then assume further characters may
     * be inserted at limit and check for partial matching.  Otherwise
     * assume the text as given is complete.
     * @return a match degree value indicating a full match, a partial
     * match, or a mismatch.  If incremental is FALSE then
     * U_PARTIAL_MATCH should never be returned.
     */
    virtual UMatchDegree matches(const Replaceable& text,
                                 int32_t& offset,
                                 int32_t limit,
                                 UBool incremental);

    /**
     * Implement UnicodeMatcher
     * @param result            Output param to receive the pattern.
     * @param escapeUnprintable if TRUE then escape the unprintable
     *                          characters.  Defaults to FALSE.
     * @return                  A reference to 'result'.
     */
    virtual UnicodeString& toPattern(UnicodeString& result,
                                     UBool escapeUnprintable = FALSE) const;

    /**
     * Implement UnicodeMatcher
     * Returns TRUE if this matcher will match a character c, where c
     * & 0xFF == v, at offset, in the forward direction (with limit >
     * offset).  This is used by <tt>RuleBasedTransliterator</tt> for
     * indexing.
     * @param v    the given value, i.e. the low byte of a candidate
     *             character
     * @return     TRUE if this matcher will match a character c,
     *             where c & 0xFF == v
     */
    virtual UBool matchesIndexValue(uint8_t v) const;

    /**
     * Implement UnicodeMatcher
     *
     * NOTE(review): judging by the name and by the parallel
     * addReplacementSetTo() declared in this class, this presumably
     * unions the set of all characters that may be matched by this
     * object into the given set -- confirm against the UnicodeMatcher
     * documentation.
     * @param toUnionTo the set that is added to
     */
    virtual void addMatchSetTo(UnicodeSet& toUnionTo) const;

    /**
     * Implement UnicodeFunctor
     *
     * The single (unnamed) parameter is a pointer to a
     * TransliterationRuleData context object.  NOTE(review):
     * presumably it replaces the context stored in the 'data' member
     * -- confirm in the implementation.
     */
    virtual void setData(const TransliterationRuleData*);

    /**
     * Replace characters in 'text' from 'start' to 'limit' with the
     * output text of this object.  Update the 'cursor' parameter to
     * give the cursor position and return the length of the
     * replacement text.
     *
     * @param text the text in which characters are replaced
     * @param start inclusive start index of text to be replaced
     * @param limit exclusive end index of text to be replaced;
     * must be greater than or equal to start
     * @param cursor output parameter for the cursor position.
     * Not all replacer objects will update this, but in a complete
     * tree of replacer objects, representing the entire output side
     * of a transliteration rule, at least one must update it.
     * @return the number of 16-bit code units in the text replacing
     * the characters at offsets start..(limit-1) in text
     */
    virtual int32_t replace(Replaceable& text,
                            int32_t start,
                            int32_t limit,
                            int32_t& cursor);

    /**
     * Returns a string representation of this replacer.  If the
     * result of calling this function is passed to the appropriate
     * parser, typically TransliteratorParser, it will produce another
     * replacer that is equal to this one.
     *
     * Unlike toPattern(), the escapeUnprintable argument has no
     * default value here.
     *
     * @param result the string to receive the pattern.  Previous
     * contents will be deleted.
     * @param escapeUnprintable if TRUE then convert unprintable
     * characters to their hex escape representations, \\uxxxx or
     * \\Uxxxxxxxx.  Unprintable characters are defined by
     * Utility.isUnprintable().
     * @return a reference to 'result'.
     */
    virtual UnicodeString& toReplacerPattern(UnicodeString& result,
                                             UBool escapeUnprintable) const;

    /**
     * Remove any match data.  This must be called before performing a
     * set of matches with this segment.
     */
    void resetMatch();

    /**
     * ICU "poor man's RTTI", returns a UClassID for the actual class.
     * @return the class ID of the dynamic type of this object.
     */
    virtual UClassID getDynamicClassID() const;

    /**
     * ICU "poor man's RTTI", returns a UClassID for this class.
     * @return the class ID of StringMatcher.
     */
    static UClassID U_EXPORT2 getStaticClassID();

    /**
     * Union the set of all characters that may be output by this
     * object into the given set.
     * @param toUnionTo the set into which to union the output characters
     */
    virtual void addReplacementSetTo(UnicodeSet& toUnionTo) const;

 private:

    /**
     * The text to be matched.
     */
    UnicodeString pattern;

    /**
     * Context object that maps stand-ins to matcher and replacer
     * objects.  Held as a pointer to const.
     * NOTE(review): presumably not owned by this object -- verify
     * against the destructor.
     */
    const TransliterationRuleData* data;

    /**
     * The segment number, 1-based, or 0 if not a segment.
     */
    int32_t segmentNumber;

    /**
     * Start offset, in the match text, of the <em>rightmost</em>
     * match.
     * NOTE(review): the class comment says the output text is the
     * leftmost match; confirm which is correct.
     */
    int32_t matchStart;

    /**
     * Limit offset, in the match text, of the <em>rightmost</em>
     * match.
     * NOTE(review): the class comment says the output text is the
     * leftmost match; confirm which is correct.
     */
    int32_t matchLimit;

};

U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_TRANSLITERATION */

#endif
Commit	Line	Data
f3c0d7a5 A	1	// © 2016 and later: Unicode, Inc. and others.
f3c0d7a5 A	2	// License & terms of use: http://www.unicode.org/copyright.html
b75a7d8f	3	/*
4388f060	4	* Copyright (C) 2001-2011, International Business Machines Corporation
374ca955 A	5	* and others. All Rights Reserved.
	6	**********************************************************************
	7	* Date Name Description
	8	* 07/23/01 aliu Creation.
	9	**********************************************************************
	10	*/
b75a7d8f A	11	#ifndef STRMATCH_H
	12	#define STRMATCH_H
	13
	14	#include "unicode/utypes.h"
	15
	16	#if !UCONFIG_NO_TRANSLITERATION
	17
	18	#include "unicode/unistr.h"
	19	#include "unicode/unifunct.h"
	20	#include "unicode/unimatch.h"
	21	#include "unicode/unirepl.h"
	22
	23	U_NAMESPACE_BEGIN
	24
	25	class TransliterationRuleData;
	26
	27	/**
	28	* An object that matches a fixed input string, implementing the
	29	* UnicodeMatcher API. This object also implements the
	30	* UnicodeReplacer API, allowing it to emit the matched text as
	31	* output. Since the match text may contain flexible match elements,
	32	* such as UnicodeSets, the emitted text is not the match pattern, but
	33	* instead a substring of the actual matched text. Following
	34	* convention, the output text is the leftmost match seen up to this
	35	* point.
	36	*
	37	* A StringMatcher may represent a segment, in which case it has a
	38	* positive segment number. This affects how the matcher converts
	39	* itself to a pattern but does not otherwise affect its function.
	40	*
	41	* A StringMatcher that is not a segment should not be used as a
	42	* UnicodeReplacer.
	43	*/
	44	class StringMatcher : public UnicodeFunctor, public UnicodeMatcher, public UnicodeReplacer {
	45
	46	public:
	47
	48	/**
	49	* Construct a matcher that matches the given pattern string.
	50	* @param string the pattern to be matched, possibly containing
	51	* stand-ins that represent nested UnicodeMatcher objects.
	52	* @param start inclusive start index of text to be replaced
	53	* @param limit exclusive end index of text to be replaced;
	54	* must be greater than or equal to start
	55	* @param segmentNum the segment number from 1..n, or 0 if this is
	56	* not a segment.
	57	* @param data context object mapping stand-ins to
	58	* UnicodeMatcher objects.
	59	*/
	60	StringMatcher(const UnicodeString& string,
	61	int32_t start,
	62	int32_t limit,
	63	int32_t segmentNum,
	64	const TransliterationRuleData& data);
	65
	66	/**
	67	* Copy constructor
	68	* @param o the object to be copied.
	69	*/
	70	StringMatcher(const StringMatcher& o);
	71
	72	/**
	73	* Destructor
	74	*/
75	virtual ~StringMatcher();
76
77	/**
78	* Implement UnicodeFunctor
79	* @return a copy of the object.
80	*/
81	virtual UnicodeFunctor* clone() const;
82
83	/**
84	* UnicodeFunctor API. Cast 'this' to a UnicodeMatcher* pointer
85	* and return the pointer.
86	* @return the UnicodeMatcher point.
87	*/
88	virtual UnicodeMatcher* toMatcher() const;
89
90	/**
91	* UnicodeFunctor API. Cast 'this' to a UnicodeReplacer* pointer
92	* and return the pointer.
93	* @return the UnicodeReplacer pointer.
94	*/
95	virtual UnicodeReplacer* toReplacer() const;
96
97	/**
98	* Implement UnicodeMatcher
99	* @param text the text to be matched
100	* @param offset on input, the index into text at which to begin
101	* matching. On output, the limit of the matched text. The
102	* number of matched characters is the output value of offset
103	* minus the input value. Offset should always point to the
104	* HIGH SURROGATE (leading code unit) of a pair of surrogates,
105	* both on entry and upon return.
106	* @param limit the limit index of text to be matched. Greater
107	* than offset for a forward direction match, less than offset for
108	* a backward direction match. The last character to be
109	* considered for matching will be text.charAt(limit-1) in the
110	* forward direction or text.charAt(limit+1) in the backward
111	* direction.
112	* @param incremental if TRUE, then assume further characters may
113	* be inserted at limit and check for partial matching. Otherwise
114	* assume the text as given is complete.
115	* @return a match degree value indicating a full match, a partial
116	* match, or a mismatch. If incremental is FALSE then
117	* U_PARTIAL_MATCH should never be returned.
118	*/
119	virtual UMatchDegree matches(const Replaceable& text,
120	int32_t& offset,
121	int32_t limit,
122	UBool incremental);
123
124	/**
125	* Implement UnicodeMatcher
126	* @param result Output param to receive the pattern.
127	* @param escapeUnprintable if True then escape the unprintable characters.
128	* @return A reference to 'result'.
129	*/
130	virtual UnicodeString& toPattern(UnicodeString& result,
131	UBool escapeUnprintable = FALSE) const;
132
133	/**
134	* Implement UnicodeMatcher
135	* Returns TRUE if this matcher will match a character c, where c
136	* & 0xFF == v, at offset, in the forward direction (with limit >
137	* offset). This is used by <tt>RuleBasedTransliterator</tt> for
138	* indexing.
139	* @param v the given value
140	* @return TRUE if this matcher will match a character c,
141	* where c & 0xFF == v
142	*/
143	virtual UBool matchesIndexValue(uint8_t v) const;
144
145	/**
146	* Implement UnicodeMatcher
147	*/
148	virtual void addMatchSetTo(UnicodeSet& toUnionTo) const;
149
150	/**
151	* Implement UnicodeFunctor
152	*/
153	virtual void setData(const TransliterationRuleData*);
154
155	/**
156	* Replace characters in 'text' from 'start' to 'limit' with the
157	* output text of this object. Update the 'cursor' parameter to
158	* give the cursor position and return the length of the
159	* replacement text.
160	*
161	* @param text the text to be matched
162	* @param start inclusive start index of text to be replaced
163	* @param limit exclusive end index of text to be replaced;
164	* must be greater than or equal to start
165	* @param cursor output parameter for the cursor position.
166	* Not all replacer objects will update this, but in a complete
167	* tree of replacer objects, representing the entire output side
168	* of a transliteration rule, at least one must update it.
169	* @return the number of 16-bit code units in the text replacing
170	* the characters at offsets start..(limit-1) in text
171	*/
172	virtual int32_t replace(Replaceable& text,
173	int32_t start,
174	int32_t limit,
175	int32_t& cursor);
176
177	/**
178	* Returns a string representation of this replacer. If the
179	* result of calling this function is passed to the appropriate
180	* parser, typically TransliteratorParser, it will produce another
181	* replacer that is equal to this one.
182	* @param result the string to receive the pattern. Previous
183	* contents will be deleted.
184	* @param escapeUnprintable if TRUE then convert unprintable
185	* character to their hex escape representations, \\uxxxx or
186	* \\Uxxxxxxxx. Unprintable characters are defined by
187	* Utility.isUnprintable().
188	* @return a reference to 'result'.
189	*/
190	virtual UnicodeString& toReplacerPattern(UnicodeString& result,
191	UBool escapeUnprintable) const;
192
193	/**
194	* Remove any match data. This must be called before performing a
195	* set of matches with this segment.
196	*/
197	void resetMatch();
198
199	/**
200	* ICU "poor man's RTTI", returns a UClassID for the actual class.
b75a7d8f	201	*/
374ca955	202	virtual UClassID getDynamicClassID() const;
b75a7d8f A	203
	204	/**
	205	* ICU "poor man's RTTI", returns a UClassID for this class.
b75a7d8f	206	*/
374ca955	207	static UClassID U_EXPORT2 getStaticClassID();
b75a7d8f A	208
	209	/**
	210	* Union the set of all characters that may output by this object
	211	* into the given set.
	212	* @param toUnionTo the set into which to union the output characters
	213	*/
	214	virtual void addReplacementSetTo(UnicodeSet& toUnionTo) const;
	215
	216	private:
	217
	218	/**
	219	* The text to be matched.
	220	*/
	221	UnicodeString pattern;
	222
	223	/**
	224	* Context object that maps stand-ins to matcher and replacer
	225	* objects.
	226	*/
	227	const TransliterationRuleData* data;
	228
	229	/**
	230	* The segment number, 1-based, or 0 if not a segment.
	231	*/
	232	int32_t segmentNumber;
	233
	234	/**
	235	* Start offset, in the match text, of the <em>rightmost</em>
	236	* match.
	237	*/
	238	int32_t matchStart;
	239
	240	/**
	241	* Limit offset, in the match text, of the <em>rightmost</em>
	242	* match.
	243	*/
	244	int32_t matchLimit;
	245
b75a7d8f A	246	};
	247
	248	U_NAMESPACE_END
	249
	250	#endif /* #if !UCONFIG_NO_TRANSLITERATION */
	251
	252	#endif