[apple/icu.git] / icuSources / i18n / usrchimp.h

// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
*   Copyright (C) 2001-2015 IBM and others. All rights reserved.
**********************************************************************
*   Date        Name        Description
*  08/13/2001   synwee      Creation.
**********************************************************************
*/
#ifndef USRCHIMP_H
#define USRCHIMP_H

#include "unicode/utypes.h"

#if !UCONFIG_NO_COLLATION

#include "unicode/normalizer2.h"
#include "unicode/ucol.h"
#include "unicode/ucoleitr.h"
#include "unicode/ubrk.h"

/*
 * Layout of a 32-bit collation element (CE) as assumed by the macros below:
 * bits 31..16 hold the primary weight, bits 15..8 the secondary weight and
 * bits 7..0 the tertiary weight.
 */

/* mask off anything but primary order */
#define UCOL_PRIMARYORDERMASK 0xffff0000
/* mask off anything but secondary order */
#define UCOL_SECONDARYORDERMASK 0x0000ff00
/* mask off anything but tertiary order */
#define UCOL_TERTIARYORDERMASK 0x000000ff
/* primary order shift */
#define UCOL_PRIMARYORDERSHIFT 16
/* secondary order shift */
#define UCOL_SECONDARYORDERSHIFT 8

/* CE value 0 */
#define UCOL_IGNORABLE 0

/*
 * Get weights from a CE.
 * Each macro evaluates its argument exactly once and yields the selected
 * weight shifted down to the low-order bits.
 * UCOL_PRIMARYORDER shifts first and masks afterwards, so the result is the
 * 16-bit primary weight even when the shift sign-extends a signed argument.
 */
#define UCOL_PRIMARYORDER(order) (((order) >> UCOL_PRIMARYORDERSHIFT) & 0xffff)
#define UCOL_SECONDARYORDER(order) (((order) & UCOL_SECONDARYORDERMASK)>> UCOL_SECONDARYORDERSHIFT)
#define UCOL_TERTIARYORDER(order) ((order) & UCOL_TERTIARYORDERMASK)

/* bit pattern (both of the two bits in 0xC0 set) tested by isContinuation() */
#define UCOL_CONTINUATION_MARKER 0xC0

/* non-zero when every bit of UCOL_CONTINUATION_MARKER is set in CE */
#define isContinuation(CE) (((CE) & UCOL_CONTINUATION_MARKER) == UCOL_CONTINUATION_MARKER)

/**
 * This indicates an error has occurred during processing or there are no more CEs
 * to be returned.
 * The #ifndef guard means a definition that is already visible when this
 * header is included is kept instead of the one below.
 */
#ifndef UCOL_PROCESSED_NULLORDER
#define UCOL_PROCESSED_NULLORDER        ((int64_t)U_INT64_MAX)
#endif

U_NAMESPACE_BEGIN

class CollationElementIterator;
class Collator;

/**
 * Element type stored in a PCEBuffer: one 64-bit CE value plus two int32_t
 * indices.
 * NOTE(review): the fields mirror the (ce, ixLow, ixHigh) arguments of
 * PCEBuffer::put(), so low/high are presumably the iterator indices around
 * the text that produced the CE - confirm against the implementation.
 */
struct PCEI
{
    uint64_t ce;    // 64-bit CE value
    int32_t  low;   // presumably the lower iterator index (ixLow) - TODO confirm
    int32_t  high;  // presumably the upper iterator index (ixHigh) - TODO confirm
};

/**
 * Container of PCEI entries with inline storage for 16 entries.
 * NOTE(review): only the declaration is visible here. `buffer` presumably
 * points at `defaultBuffer` until more room is needed, and put()/get()
 * presumably push/pop entries in stack order - confirm against the
 * implementation before relying on either.
 */
struct PCEBuffer
{
    PCEI    defaultBuffer[16];  // inline storage
    PCEI   *buffer;             // storage currently in use
    int32_t bufferIndex;        // current position within buffer
    int32_t bufferSize;         // capacity of buffer, in entries (presumably) - TODO confirm

    PCEBuffer();
    ~PCEBuffer();

    void  reset();
    UBool isEmpty() const;
    // Stores one (ce, ixLow, ixHigh) entry; failures are reported through errorCode.
    void  put(uint64_t ce, int32_t ixLow, int32_t ixHigh, UErrorCode &errorCode);
    // Returns a pointer to a stored entry; the pointee must not be modified by the caller.
    const PCEI *get();
};

/**
 * Produces 64-bit "processed" collation elements from a collation element
 * iterator, in either direction, together with the iterator indices
 * surrounding each element.
 * Can be set up from either the C handle (UCollationElements) or the C++
 * object (CollationElementIterator).
 */
class UCollationPCE : public UMemory {
private:
    PCEBuffer          pceBuffer;
    CollationElementIterator *cei;
    // NOTE(review): the four fields below are presumably collator attributes
    // captured by init(const Collator &) and consumed by processCE() -
    // confirm against the implementation.
    UCollationStrength strength;
    UBool              toShift;
    UBool              isShifted;
    uint32_t           variableTop;

public:
    UCollationPCE(UCollationElements *elems);
    UCollationPCE(CollationElementIterator *iter);
    ~UCollationPCE();

    void init(UCollationElements *elems);
    void init(CollationElementIterator *iter);

    /**
     * Get the processed ordering priority of the next collation element in the text.
     * A single character may contain more than one collation element.
     *
     * @param ixLow A pointer to an int32_t to receive the iterator index before fetching the CE.
     * @param ixHigh A pointer to an int32_t to receive the iterator index after fetching the CE.
     * @param status A pointer to a UErrorCode to receive any errors.
     * @return The next collation element's ordering, otherwise returns UCOL_PROCESSED_NULLORDER
     *         if an error has occurred or if the end of string has been reached.
     */
    int64_t nextProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status);
    /**
     * Get the processed ordering priority of the previous collation element in the text.
     * A single character may contain more than one collation element.
     *
     * @param ixLow A pointer to an int32_t to receive the iterator index after fetching the CE.
     * @param ixHigh A pointer to an int32_t to receive the iterator index before fetching the CE.
     * @param status A pointer to a UErrorCode to receive any errors. Notably,
     *               a U_BUFFER_OVERFLOW_ERROR is returned if the internal stack
     *               buffer has been exhausted.
     * @return The previous collation element's ordering, otherwise returns
     *         UCOL_PROCESSED_NULLORDER if an error has occurred or if the start of
     *         string has been reached.
     */
    int64_t previousProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status);

private:
    void init(const Collator &coll);
    // Maps a 32-bit CE to its 64-bit processed form.
    uint64_t processCE(uint32_t ce);
};

U_NAMESPACE_END

/* Element count of the fixed in-struct arrays: UPattern::cesBuffer,
 * UPattern::pcesBuffer and the canonical accent buffers of UStringSearch. */
#define INITIAL_ARRAY_SIZE_       256
/* Element count of the UPattern::shift and UPattern::backShift tables. */
#define MAX_TABLE_SIZE_           257

/**
 * Per-search state: the text being searched, the matching options, the break
 * iterators in use and the position/length of the most recent match.
 */
struct USearch {
    // The text is kept here because the collation element iterator does not
    // have a getText API.
    const UChar              *text;
          int32_t             textLength; // exact length
          UBool               isOverlap;
          UBool               isCanonicalMatch;
          int16_t             elementComparisonType;
          UBreakIterator     *internalBreakIter;  // internal character break iterator
          UBreakIterator     *breakIter;
    // matchedIndex defaults to USEARCH_DONE.
    // If matchedIndex is USEARCH_DONE and we are not at the start or the end
    // of the text (depending on the iteration direction), it means that no
    // more matches can be found in that particular direction.
          int32_t             matchedIndex; 
          int32_t             matchedLength;
          UBool               isForwardSearching;
          UBool               reset;
};

/**
 * The search pattern together with data derived from it: its collation
 * elements (32-bit `ces` and 64-bit `pces`), accent flags and shift tables.
 * NOTE(review): `ces`/`pces` presumably point at the in-struct
 * `cesBuffer`/`pcesBuffer` (INITIAL_ARRAY_SIZE_ entries each) when the data
 * fits and at separately allocated storage otherwise - confirm against the
 * implementation.
 */
struct UPattern {
    const UChar              *text;
          int32_t             textLength; // exact length
          // length required for backwards ce comparison
          int32_t             cesLength;
          int32_t            *ces;
          int32_t             cesBuffer[INITIAL_ARRAY_SIZE_];
          // length of the pces array (presumably in elements) - TODO confirm
          int32_t             pcesLength;
          int64_t            *pces;
          int64_t             pcesBuffer[INITIAL_ARRAY_SIZE_];
          UBool               hasPrefixAccents;
          UBool               hasSuffixAccents;
          int16_t             defaultShiftSize;
          // shift tables, MAX_TABLE_SIZE_ entries each
          int16_t             shift[MAX_TABLE_SIZE_];
          int16_t             backShift[MAX_TABLE_SIZE_];
};

/**
 * Top-level string search object: ties together the search state (USearch),
 * the embedded pattern (UPattern), the collator and the collation element
 * iterators used while matching.
 */
struct UStringSearch {
    struct USearch            *search;
    struct UPattern            pattern;
    const  UCollator          *collator;
    const  icu::Normalizer2   *nfd;
    // Positions within the collation element iterator are used to determine
    // if we are at the start of the text.
           UCollationElements *textIter;
           icu::UCollationPCE *textProcessedIter;
    // Utility collation element iterator, used throughout the program for
    // temporary iteration.
           UCollationElements *utilIter;
    // presumably TRUE when this object owns `collator` and must release it -
    // TODO confirm against the implementation
           UBool               ownCollator;
           UCollationStrength  strength;
           uint32_t            ceMask;
           uint32_t            variableTop;
           UBool               toShift;
    // fixed-size accent buffers, INITIAL_ARRAY_SIZE_ UChars each
           UChar               canonicalPrefixAccents[INITIAL_ARRAY_SIZE_];
           UChar               canonicalSuffixAccents[INITIAL_ARRAY_SIZE_];
};

/**
* Exact matches without checking the ends for extra accents.
* The match after the position within the collation element iterator is to be
* found.
* After a match is found the offset in the collation element iterator will be
* shifted to the start of the match.
* Implementation note:
* For tertiary we can't use the collator->tertiaryMask; that is a
* preprocessed mask that takes into account case options. Since we are only
* concerned with exact matches, we don't need that.
* Alternate handling - since only the 16 most significant digits are used,
* we can safely do a compare without masking; if the ce is a variable, we mask
* and get only the primary values. No shifting to quaternary is required since
* all primary values less than variabletop will need to be masked off anyway.
* If the end character is composite and the pattern ce does not match the text
* ce, we skip it until we find a match in the end composite character or when
* it has passed the character. This is so that we can match pattern "a" with
* the text "\u00e6".
* @param strsrch string search data
* @param status error status if any
* @return TRUE if an exact match is found, FALSE otherwise
*/
U_CFUNC
UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status);

/**
* Canonical matches.
* According to the definition, a match found here will include the whole span
* of beginning and ending accents if it overlaps that region.
* @param strsrch string search data
* @param status error status if any
* @return TRUE if a canonical match is found, FALSE otherwise
*/
U_CFUNC
UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status);

/**
* Gets the previous match.
* The comments on usearch_handleNextExact apply here as well.
* @param strsrch string search data
* @param status error status if any
* @return TRUE if an exact match is found, FALSE otherwise.
*/
U_CFUNC
UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status);

/**
* Canonical matches (the "previous" counterpart of usearch_handleNextCanonical).
* According to the definition, a match found here will include the whole span
* of beginning and ending accents if it overlaps that region.
* @param strsrch string search data
* @param status error status if any
* @return TRUE if a canonical match is found, FALSE otherwise
*/
U_CFUNC
UBool usearch_handlePreviousCanonical(UStringSearch *strsrch, 
                                      UErrorCode    *status);

#endif /* #if !UCONFIG_NO_COLLATION */

#endif
Commit	Line	Data
f3c0d7a5 A	1	// © 2016 and later: Unicode, Inc. and others.
f3c0d7a5 A	2	// License & terms of use: http://www.unicode.org/copyright.html
b75a7d8f A	3	/*
b75a7d8f A	4	**********************************************************************
2ca993e8	5	* Copyright (C) 2001-2015 IBM and others. All rights reserved.
b75a7d8f A	6	**********************************************************************
	7	* Date Name Description
	8	* 08/13/2001 synwee Creation.
	9	**********************************************************************
	10	*/
	11	#ifndef USRCHIMP_H
	12	#define USRCHIMP_H
	13
	14	#include "unicode/utypes.h"
	15
	16	#if !UCONFIG_NO_COLLATION
	17
729e4ab9	18	#include "unicode/normalizer2.h"
b75a7d8f A	19	#include "unicode/ucol.h"
	20	#include "unicode/ucoleitr.h"
	21	#include "unicode/ubrk.h"
	22
57a6839d A	23	/* mask off anything but primary order */
	24	#define UCOL_PRIMARYORDERMASK 0xffff0000
	25	/* mask off anything but secondary order */
	26	#define UCOL_SECONDARYORDERMASK 0x0000ff00
	27	/* mask off anything but tertiary order */
	28	#define UCOL_TERTIARYORDERMASK 0x000000ff
	29	/* primary order shift */
	30	#define UCOL_PRIMARYORDERSHIFT 16
	31	/* secondary order shift */
	32	#define UCOL_SECONDARYORDERSHIFT 8
	33
	34	#define UCOL_IGNORABLE 0
	35
	36	/* get weights from a CE */
	37	#define UCOL_PRIMARYORDER(order) (((order) >> 16) & 0xffff)
	38	#define UCOL_SECONDARYORDER(order) (((order) & UCOL_SECONDARYORDERMASK)>> UCOL_SECONDARYORDERSHIFT)
	39	#define UCOL_TERTIARYORDER(order) ((order) & UCOL_TERTIARYORDERMASK)
	40
	41	#define UCOL_CONTINUATION_MARKER 0xC0
	42
	43	#define isContinuation(CE) (((CE) & UCOL_CONTINUATION_MARKER) == UCOL_CONTINUATION_MARKER)
	44
	45	/**
	46	* This indicates an error has occured during processing or there are no more CEs
	47	* to be returned.
	48	*/
	49	#ifndef UCOL_PROCESSED_NULLORDER
	50	#define UCOL_PROCESSED_NULLORDER ((int64_t)U_INT64_MAX)
	51	#endif
	52
	53	U_NAMESPACE_BEGIN
	54
	55	class CollationElementIterator;
	56	class Collator;
	57
	58	struct PCEI
	59	{
	60	uint64_t ce;
	61	int32_t low;
	62	int32_t high;
	63	};
	64
	65	struct PCEBuffer
	66	{
	67	PCEI defaultBuffer[16];
	68	PCEI *buffer;
	69	int32_t bufferIndex;
	70	int32_t bufferSize;
	71
	72	PCEBuffer();
	73	~PCEBuffer();
	74
	75	void reset();
2ca993e8 A	76	UBool isEmpty() const;
2ca993e8 A	77	void put(uint64_t ce, int32_t ixLow, int32_t ixHigh, UErrorCode &errorCode);
57a6839d A	78	const PCEI *get();
	79	};
	80
	81	class UCollationPCE : public UMemory {
	82	private:
	83	PCEBuffer pceBuffer;
	84	CollationElementIterator *cei;
	85	UCollationStrength strength;
	86	UBool toShift;
	87	UBool isShifted;
	88	uint32_t variableTop;
	89
	90	public:
	91	UCollationPCE(UCollationElements *elems);
	92	UCollationPCE(CollationElementIterator *iter);
	93	~UCollationPCE();
	94
	95	void init(UCollationElements *elems);
	96	void init(CollationElementIterator *iter);
	97
	98	/**
	99	* Get the processed ordering priority of the next collation element in the text.
	100	* A single character may contain more than one collation element.
	101	*
	102	* @param ixLow a pointer to an int32_t to receive the iterator index before fetching the CE.
	103	* @param ixHigh a pointer to an int32_t to receive the iterator index after fetching the CE.
	104	* @param status A pointer to an UErrorCode to receive any errors.
	105	* @return The next collation elements ordering, otherwise returns UCOL_PROCESSED_NULLORDER
	106	* if an error has occured or if the end of string has been reached
	107	*/
	108	int64_t nextProcessed(int32_t ixLow, int32_t ixHigh, UErrorCode *status);
	109	/**
	110	* Get the processed ordering priority of the previous collation element in the text.
	111	* A single character may contain more than one collation element.
	112	*
	113	* @param ixLow A pointer to an int32_t to receive the iterator index after fetching the CE
	114	* @param ixHigh A pointer to an int32_t to receiver the iterator index before fetching the CE
	115	* @param status A pointer to an UErrorCode to receive any errors. Noteably
	116	* a U_BUFFER_OVERFLOW_ERROR is returned if the internal stack
	117	* buffer has been exhausted.
	118	* @return The previous collation elements ordering, otherwise returns
	119	* UCOL_PROCESSED_NULLORDER if an error has occured or if the start of
	120	* string has been reached.
	121	*/
	122	int64_t previousProcessed(int32_t ixLow, int32_t ixHigh, UErrorCode *status);
	123
	124	private:
	125	void init(const Collator &coll);
	126	uint64_t processCE(uint32_t ce);
	127	};
	128
	129	U_NAMESPACE_END
	130
b75a7d8f A	131	#define INITIAL_ARRAY_SIZE_ 256
	132	#define MAX_TABLE_SIZE_ 257
	133
	134	struct USearch {
	135	// required since collation element iterator does not have a getText API
	136	const UChar *text;
	137	int32_t textLength; // exact length
	138	UBool isOverlap;
	139	UBool isCanonicalMatch;
729e4ab9 A	140	int16_t elementComparisonType;
729e4ab9 A	141	UBreakIterator *internalBreakIter; //internal character breakiterator
b75a7d8f A	142	UBreakIterator *breakIter;
	143	// value USEARCH_DONE is the default value
	144	// if we are not at the start of the text or the end of the text,
	145	// depending on the iteration direction and matchedIndex is USEARCH_DONE
46f4442e A	146	// it means that we can't find any more matches in that particular direction
46f4442e A	147	int32_t matchedIndex;
b75a7d8f A	148	int32_t matchedLength;
	149	UBool isForwardSearching;
	150	UBool reset;
	151	};
	152
	153	struct UPattern {
	154	const UChar *text;
	155	int32_t textLength; // exact length
	156	// length required for backwards ce comparison
b331163b A	157	int32_t cesLength;
	158	int32_t *ces;
	159	int32_t cesBuffer[INITIAL_ARRAY_SIZE_];
	160	int32_t pcesLength;
	161	int64_t *pces;
	162	int64_t pcesBuffer[INITIAL_ARRAY_SIZE_];
b75a7d8f A	163	UBool hasPrefixAccents;
	164	UBool hasSuffixAccents;
	165	int16_t defaultShiftSize;
	166	int16_t shift[MAX_TABLE_SIZE_];
	167	int16_t backShift[MAX_TABLE_SIZE_];
	168	};
	169
	170	struct UStringSearch {
	171	struct USearch *search;
	172	struct UPattern pattern;
	173	const UCollator *collator;
4388f060	174	const icu::Normalizer2 *nfd;
b75a7d8f A	175	// positions within the collation element iterator is used to determine
	176	// if we are at the start of the text.
	177	UCollationElements *textIter;
57a6839d	178	icu::UCollationPCE *textProcessedIter;
b75a7d8f A	179	// utility collation element, used throughout program for temporary
	180	// iteration.
	181	UCollationElements *utilIter;
	182	UBool ownCollator;
	183	UCollationStrength strength;
	184	uint32_t ceMask;
	185	uint32_t variableTop;
	186	UBool toShift;
	187	UChar canonicalPrefixAccents[INITIAL_ARRAY_SIZE_];
	188	UChar canonicalSuffixAccents[INITIAL_ARRAY_SIZE_];
	189	};
	190
	191	/**
	192	* Exact matches without checking for the ends for extra accents.
	193	* The match after the position within the collation element iterator is to be
	194	* found.
	195	* After a match is found the offset in the collation element iterator will be
	196	* shifted to the start of the match.
	197	* Implementation note:
	198	* For tertiary we can't use the collator->tertiaryMask, that is a
	199	* preprocessed mask that takes into account case options. since we are only
	200	* concerned with exact matches, we don't need that.
	201	* Alternate handling - since only the 16 most significant digits is only used,
	202	* we can safely do a compare without masking if the ce is a variable, we mask
	203	* and get only the primary values no shifting to quartenary is required since
	204	* all primary values less than variabletop will need to be masked off anyway.
	205	* If the end character is composite and the pattern ce does not match the text
	206	* ce, we skip it until we find a match in the end composite character or when
	207	* it has passed the character. This is so that we can match pattern "a" with
	208	* the text "\u00e6"
	209	* @param strsrch string search data
	210	* @param status error status if any
	211	* @return TRUE if an exact match is found, FALSE otherwise
	212	*/
	213	U_CFUNC
	214	UBool usearch_handleNextExact(UStringSearch strsrch, UErrorCode status);
	215
	216	/**
	217	* Canonical matches.
	218	* According to the definition, matches found here will include the whole span
	219	* of beginning and ending accents if it overlaps that region.
	220	* @param strsrch string search data
	221	* @param status error status if any
	222	* @return TRUE if a canonical match is found, FALSE otherwise
	223	*/
	224	U_CFUNC
	225	UBool usearch_handleNextCanonical(UStringSearch strsrch, UErrorCode status);
	226
	227	/**
	228	* Gets the previous match.
	229	* Comments follows from handleNextExact
	230	* @param strsrch string search data
	231	* @param status error status if any
	232	* @return True if a exact math is found, FALSE otherwise.
	233	*/
	234	U_CFUNC
	235	UBool usearch_handlePreviousExact(UStringSearch strsrch, UErrorCode status);
	236
	237	/**
	238	* Canonical matches.
	239	* According to the definition, matches found here will include the whole span
	240	* of beginning and ending accents if it overlaps that region.
	241	* @param strsrch string search data
	242	* @param status error status if any
243	* @return TRUE if a canonical match is found, FALSE otherwise
244	*/
245	U_CFUNC
246	UBool usearch_handlePreviousCanonical(UStringSearch *strsrch,
247	UErrorCode *status);
248
249	#endif /* #if !UCONFIG_NO_COLLATION */
250
251	#endif