git.saurik.com Git - apple/icu.git/blame_incremental

... / ...

Commit	Line	Data
	1	//
	2	// Copyright (C) 2002-2012 International Business Machines Corporation
	3	// and others. All rights reserved.
	4	//
	5	// file: regeximp.h
	6	//
	7	// ICU Regular Expressions,
	8	// Definitions of constant values used in the compiled form of
	9	// a regular expression pattern.
	10	//
	11
	12	#ifndef _REGEXIMP_H
	13	#define _REGEXIMP_H
	14
	15	#include "unicode/utypes.h"
	16	#include "unicode/uobject.h"
	17	#include "unicode/uniset.h"
	18	#include "unicode/utext.h"
	19
	20	#include "cmemory.h"
	21	#include "ucase.h"
	22
	23	U_NAMESPACE_BEGIN
	24
	25	// For debugging, define REGEX_DEBUG
	26	// To define with configure,
	27	// ./runConfigureICU --enable-debug --disable-release Linux CPPFLAGS="-DREGEX_DEBUG"
	28
	29	#ifdef REGEX_DEBUG
	30	//
	31	// debugging options. Enable one or more of the three #defines immediately following
	32	//
	33
	34	//#define REGEX_SCAN_DEBUG
	35	#define REGEX_DUMP_DEBUG
	36	#define REGEX_RUN_DEBUG
	37
	38	// End of #defines inteded to be directly set.
	39
	40	#include <stdio.h>
	41	#endif
	42
	43	#ifdef REGEX_SCAN_DEBUG
	44	#define REGEX_SCAN_DEBUG_PRINTF(a) printf a
	45	#else
	46	#define REGEX_SCAN_DEBUG_PRINTF(a)
	47	#endif
	48
	49	#ifdef REGEX_DUMP_DEBUG
	50	#define REGEX_DUMP_DEBUG_PRINTF(a) printf a
	51	#else
	52	#define REGEX_DUMP_DEBUG_PRINTF(a)
	53	#endif
	54
	55	#ifdef REGEX_RUN_DEBUG
	56	#define REGEX_RUN_DEBUG_PRINTF(a) printf a
	57	#define REGEX_DUMP_DEBUG_PRINTF(a) printf a
	58	#else
	59	#define REGEX_RUN_DEBUG_PRINTF(a)
	60	#endif
	61
	62
	63	//
	64	// Opcode types In the compiled form of the regexp, these are the type, or opcodes,
	65	// of the entries.
	66	//
	67	enum {
	68	URX_RESERVED_OP = 0, // For multi-operand ops, most non-first words.
	69	URX_RESERVED_OP_N = 255, // For multi-operand ops, negative operand values.
	70	URX_BACKTRACK = 1, // Force a backtrack, as if a match test had failed.
	71	URX_END = 2,
	72	URX_ONECHAR = 3, // Value field is the 21 bit unicode char to match
	73	URX_STRING = 4, // Value field is index of string start
	74	URX_STRING_LEN = 5, // Value field is string length (code units)
	75	URX_STATE_SAVE = 6, // Value field is pattern position to push
	76	URX_NOP = 7,
	77	URX_START_CAPTURE = 8, // Value field is capture group number.
	78	URX_END_CAPTURE = 9, // Value field is capture group number
	79	URX_STATIC_SETREF = 10, // Value field is index of set in array of sets.
	80	URX_SETREF = 11, // Value field is index of set in array of sets.
	81	URX_DOTANY = 12,
	82	URX_JMP = 13, // Value field is destination position in
	83	// the pattern.
	84	URX_FAIL = 14, // Stop match operation, No match.
	85
	86	URX_JMP_SAV = 15, // Operand: JMP destination location
	87	URX_BACKSLASH_B = 16, // Value field: 0: \b 1: \B
	88	URX_BACKSLASH_G = 17,
	89	URX_JMP_SAV_X = 18, // Conditional JMP_SAV,
	90	// Used in (x)+, breaks loop on zero length match.
	91	// Operand: Jmp destination.
	92	URX_BACKSLASH_X = 19,
	93	URX_BACKSLASH_Z = 20, // \z Unconditional end of line.
	94
	95	URX_DOTANY_ALL = 21, // ., in the . matches any mode.
	96	URX_BACKSLASH_D = 22, // Value field: 0: \d 1: \D
	97	URX_CARET = 23, // Value field: 1: multi-line mode.
	98	URX_DOLLAR = 24, // Also for \Z
	99
	100	URX_CTR_INIT = 25, // Counter Inits for {Interval} loops.
	101	URX_CTR_INIT_NG = 26, // 2 kinds, normal and non-greedy.
	102	// These are 4 word opcodes. See description.
	103	// First Operand: Data loc of counter variable
	104	// 2nd Operand: Pat loc of the URX_CTR_LOOPx
	105	// at the end of the loop.
	106	// 3rd Operand: Minimum count.
	107	// 4th Operand: Max count, -1 for unbounded.
	108
	109	URX_DOTANY_UNIX = 27, // '.' operator in UNIX_LINES mode, only \n marks end of line.
	110
	111	URX_CTR_LOOP = 28, // Loop Ops for {interval} loops.
	112	URX_CTR_LOOP_NG = 29, // Also in three flavors.
	113	// Operand is loc of corresponding CTR_INIT.
	114
	115	URX_CARET_M_UNIX = 30, // '^' operator, test for start of line in multi-line
	116	// plus UNIX_LINES mode.
	117
	118	URX_RELOC_OPRND = 31, // Operand value in multi-operand ops that refers
	119	// back into compiled pattern code, and thus must
	120	// be relocated when inserting/deleting ops in code.
	121
	122	URX_STO_SP = 32, // Store the stack ptr. Operand is location within
	123	// matcher data (not stack data) to store it.
	124	URX_LD_SP = 33, // Load the stack pointer. Operand is location
	125	// to load from.
	126	URX_BACKREF = 34, // Back Reference. Parameter is the index of the
	127	// capture group variables in the state stack frame.
	128	URX_STO_INP_LOC = 35, // Store the input location. Operand is location
	129	// within the matcher stack frame.
	130	URX_JMPX = 36, // Conditional JMP.
	131	// First Operand: JMP target location.
	132	// Second Operand: Data location containing an
	133	// input position. If current input position ==
	134	// saved input position, FAIL rather than taking
	135	// the JMP
	136	URX_LA_START = 37, // Starting a LookAround expression.
	137	// Save InputPos and SP in static data.
	138	// Operand: Static data offset for the save
	139	URX_LA_END = 38, // Ending a Lookaround expression.
	140	// Restore InputPos and Stack to saved values.
	141	// Operand: Static data offset for saved data.
	142	URX_ONECHAR_I = 39, // Test for case-insensitive match of a literal character.
	143	// Operand: the literal char.
	144	URX_STRING_I = 40, // Case insensitive string compare.
	145	// First Operand: Index of start of string in string literals
	146	// Second Operand (next word in compiled code):
	147	// the length of the string.
	148	URX_BACKREF_I = 41, // Case insensitive back reference.
	149	// Parameter is the index of the
	150	// capture group variables in the state stack frame.
	151	URX_DOLLAR_M = 42, // $ in multi-line mode.
	152	URX_CARET_M = 43, // ^ in multi-line mode.
	153	URX_LB_START = 44, // LookBehind Start.
	154	// Paramater is data location
	155	URX_LB_CONT = 45, // LookBehind Continue.
	156	// Param 0: the data location
	157	// Param 1: The minimum length of the look-behind match
	158	// Param 2: The max length of the look-behind match
	159	URX_LB_END = 46, // LookBehind End.
	160	// Parameter is the data location.
	161	// Check that match ended at the right spot,
	162	// Restore original input string len.
	163	URX_LBN_CONT = 47, // Negative LookBehind Continue
	164	// Param 0: the data location
	165	// Param 1: The minimum length of the look-behind match
	166	// Param 2: The max length of the look-behind match
	167	// Param 3: The pattern loc following the look-behind block.
	168	URX_LBN_END = 48, // Negative LookBehind end
	169	// Parameter is the data location.
	170	// Check that the match ended at the right spot.
	171	URX_STAT_SETREF_N = 49, // Reference to a prebuilt set (e.g. \w), negated
	172	// Operand is index of set in array of sets.
	173	URX_LOOP_SR_I = 50, // Init a [set]* loop.
	174	// Operand is the sets index in array of user sets.
	175	URX_LOOP_C = 51, // Continue a [set]* or OneChar* loop.
	176	// Operand is a matcher static data location.
	177	// Must always immediately follow LOOP_x_I instruction.
	178	URX_LOOP_DOT_I = 52, // .*, initialization of the optimized loop.
	179	// Operand value:
	180	// bit 0:
	181	// 0: Normal (. doesn't match new-line) mode.
	182	// 1: . matches new-line mode.
	183	// bit 1: controls what new-lines are recognized by this operation.
	184	// 0: All Unicode New-lines
	185	// 1: UNIX_LINES, \u000a only.
	186	URX_BACKSLASH_BU = 53, // \b or \B in UREGEX_UWORD mode, using Unicode style
	187	// word boundaries.
	188	URX_DOLLAR_D = 54, // $ end of input test, in UNIX_LINES mode.
	189	URX_DOLLAR_MD = 55 // $ end of input test, in MULTI_LINE and UNIX_LINES mode.
	190
	191	};
	192
	193	// Keep this list of opcode names in sync with the above enum
	194	// Used for debug printing only.
	195	#define URX_OPCODE_NAMES \
	196	" ", \
	197	"BACKTRACK", \
	198	"END", \
	199	"ONECHAR", \
	200	"STRING", \
	201	"STRING_LEN", \
	202	"STATE_SAVE", \
	203	"NOP", \
	204	"START_CAPTURE", \
	205	"END_CAPTURE", \
	206	"URX_STATIC_SETREF", \
	207	"SETREF", \
	208	"DOTANY", \
	209	"JMP", \
	210	"FAIL", \
	211	"JMP_SAV", \
	212	"BACKSLASH_B", \
	213	"BACKSLASH_G", \
	214	"JMP_SAV_X", \
	215	"BACKSLASH_X", \
	216	"BACKSLASH_Z", \
	217	"DOTANY_ALL", \
	218	"BACKSLASH_D", \
	219	"CARET", \
	220	"DOLLAR", \
	221	"CTR_INIT", \
	222	"CTR_INIT_NG", \
	223	"DOTANY_UNIX", \
	224	"CTR_LOOP", \
	225	"CTR_LOOP_NG", \
	226	"URX_CARET_M_UNIX", \
	227	"RELOC_OPRND", \
	228	"STO_SP", \
	229	"LD_SP", \
	230	"BACKREF", \
	231	"STO_INP_LOC", \
	232	"JMPX", \
	233	"LA_START", \
	234	"LA_END", \
	235	"ONECHAR_I", \
	236	"STRING_I", \
	237	"BACKREF_I", \
	238	"DOLLAR_M", \
	239	"CARET_M", \
	240	"LB_START", \
	241	"LB_CONT", \
	242	"LB_END", \
	243	"LBN_CONT", \
	244	"LBN_END", \
	245	"STAT_SETREF_N", \
	246	"LOOP_SR_I", \
	247	"LOOP_C", \
	248	"LOOP_DOT_I", \
	249	"BACKSLASH_BU", \
	250	"DOLLAR_D", \
	251	"DOLLAR_MD"
	252
	253
	254	//
	255	// Convenience macros for assembling and disassembling a compiled operation.
	256	//
	257	#define URX_BUILD(type, val) (int32_t)((type << 24) \| (val))
	258	#define URX_TYPE(x) ((uint32_t)(x) >> 24)
	259	#define URX_VAL(x) ((x) & 0xffffff)
	260
	261
	262	//
	263	// Access to Unicode Sets composite character properties
	264	// The sets are accessed by the match engine for things like \w (word boundary)
	265	//
	266	enum {
	267	URX_ISWORD_SET = 1,
	268	URX_ISALNUM_SET = 2,
	269	URX_ISALPHA_SET = 3,
	270	URX_ISSPACE_SET = 4,
	271
	272	URX_GC_NORMAL, // Sets for finding grapheme cluster boundaries.
	273	URX_GC_EXTEND,
	274	URX_GC_CONTROL,
	275	URX_GC_L,
	276	URX_GC_LV,
	277	URX_GC_LVT,
	278	URX_GC_V,
	279	URX_GC_T,
	280
	281	URX_LAST_SET,
	282
	283	URX_NEG_SET = 0x800000 // Flag bit to reverse sense of set
	284	// membership test.
	285	};
	286
	287
	288	//
	289	// Match Engine State Stack Frame Layout.
	290	//
	291	struct REStackFrame {
	292	// Header
	293	int64_t fInputIdx; // Position of next character in the input string
	294	int64_t fPatIdx; // Position of next Op in the compiled pattern
	295	// (int64_t for UVector64, values fit in an int32_t)
	296	// Remainder
	297	int64_t fExtra[1]; // Extra state, for capture group start/ends
	298	// atomic parentheses, repeat counts, etc.
	299	// Locations assigned at pattern compile time.
	300	// Variable-length array.
	301	};
	302	// number of UVector elements in the header
	303	#define RESTACKFRAME_HDRCOUNT 2
	304
	305	//
	306	// Start-Of-Match type. Used by find() to quickly scan to positions where a
	307	// match might start before firing up the full match engine.
	308	//
	309	enum StartOfMatch {
	310	START_NO_INFO, // No hint available.
	311	START_CHAR, // Match starts with a literal code point.
	312	START_SET, // Match starts with something matching a set.
	313	START_START, // Match starts at start of buffer only (^ or \A)
	314	START_LINE, // Match starts with ^ in multi-line mode.
	315	START_STRING // Match starts with a literal string.
	316	};
	317
	318	#define START_OF_MATCH_STR(v) ((v)==START_NO_INFO? "START_NO_INFO" : \
	319	(v)==START_CHAR? "START_CHAR" : \
	320	(v)==START_SET? "START_SET" : \
	321	(v)==START_START? "START_START" : \
	322	(v)==START_LINE? "START_LINE" : \
	323	(v)==START_STRING? "START_STRING" : \
	324	"ILLEGAL")
	325
	326	//
	327	// 8 bit set, to fast-path latin-1 set membership tests.
	328	//
	329	struct Regex8BitSet : public UMemory {
	330	inline Regex8BitSet();
	331	inline void operator = (const Regex8BitSet &s);
	332	inline void init(const UnicodeSet *src);
	333	inline UBool contains(UChar32 c);
	334	inline void add(UChar32 c);
	335	int8_t d[32];
	336	};
	337
	338	inline Regex8BitSet::Regex8BitSet() {
	339	uprv_memset(d, 0, sizeof(d));
	340	}
	341
	342	inline UBool Regex8BitSet::contains(UChar32 c) {
	343	// No bounds checking! This is deliberate.
	344	return ((d[c>>3] & 1 <<(c&7)) != 0);
	345	}
	346
	347	inline void Regex8BitSet::add(UChar32 c) {
	348	d[c>>3] \|= 1 << (c&7);
	349	}
	350
	351	inline void Regex8BitSet::init(const UnicodeSet *s) {
	352	if (s != NULL) {
	353	for (int32_t i=0; i<=255; i++) {
	354	if (s->contains(i)) {
	355	this->add(i);
	356	}
	357	}
	358	}
	359	}
	360
	361	inline void Regex8BitSet::operator = (const Regex8BitSet &s) {
	362	uprv_memcpy(d, s.d, sizeof(d));
	363	}
	364
	365
	366	// Case folded UText Iterator helper class.
	367	// Wraps a UText, provides a case-folded enumeration over its contents.
	368	// Used in implementing case insensitive matching constructs.
	369	// Implementation in rematch.cpp
	370
	371	class CaseFoldingUTextIterator: public UMemory {
	372	public:
	373	CaseFoldingUTextIterator(UText &text);
	374	~CaseFoldingUTextIterator();
	375
	376	UChar32 next(); // Next case folded character
	377
	378	UBool inExpansion(); // True if last char returned from next() and the
	379	// next to be returned both originated from a string
	380	// folding of the same code point from the orignal UText.
	381	private:
	382	UText &fUText;
	383	const UCaseProps *fcsp;
	384	const UChar *fFoldChars;
	385	int32_t fFoldLength;
	386	int32_t fFoldIndex;
	387
	388	};
	389
	390
	391	// Case folded UChar * string iterator.
	392	// Wraps a UChar *, provides a case-folded enumeration over its contents.
	393	// Used in implementing case insensitive matching constructs.
	394	// Implementation in rematch.cpp
	395
	396	class CaseFoldingUCharIterator: public UMemory {
	397	public:
	398	CaseFoldingUCharIterator(const UChar *chars, int64_t start, int64_t limit);
	399	~CaseFoldingUCharIterator();
	400
	401	UChar32 next(); // Next case folded character
	402
	403	UBool inExpansion(); // True if last char returned from next() and the
	404	// next to be returned both originated from a string
	405	// folding of the same code point from the orignal UText.
	406
	407	int64_t getIndex(); // Return the current input buffer index.
	408
	409	private:
	410	const UChar *fChars;
	411	int64_t fIndex;
	412	int64_t fLimit;
	413	const UCaseProps *fcsp;
	414	const UChar *fFoldChars;
	415	int32_t fFoldLength;
	416	int32_t fFoldIndex;
	417
	418	};
	419
	420	U_NAMESPACE_END
	421	#endif
	422