git.saurik.com Git - apple/icu.git/blame_incremental

... / ...

Commit	Line	Data
	1	#
	2	# Copyright (C) 2002-2006, International Business Machines Corporation and others.
	3	# All Rights Reserved.
	4	#
	5	# file: sent.txt
	6	#
	7	# ICU Sentence Break Rules
	8	# See Unicode Standard Annex #29.
	9	# These rules are based on UAX #29, version 5.0.0
	10	# Includes post 5.0 changes to treat Japanese half width voicing marks
	11	# as Grapheme Extend.
	12	#
	13
	14	# Half-width katakana voiced / semi-voiced sound marks (U+FF9E, U+FF9F).
	15	$VoiceMarks = [\uff9e\uff9f];
	16
	17	#
	18	# Character categories as defined in TR 29, one set per Sentence_Break value.
	19	# $VoiceMarks is subtracted from $OLetter; it is added to $Extend further down.
	20	$Sep = [\p{Sentence_Break = Sep}];
	21	$Format = [\p{Sentence_Break = Format}];
	22	$Sp = [\p{Sentence_Break = Sp}];
	23	$Lower = [\p{Sentence_Break = Lower}];
	24	$Upper = [\p{Sentence_Break = Upper}];
	25	$OLetter = [\p{Sentence_Break = OLetter}-$VoiceMarks];
	26	$Numeric = [\p{Sentence_Break = Numeric}];
	27	$ATerm = [\p{Sentence_Break = ATerm}];
	28	$STerm = [\p{Sentence_Break = STerm}];
	29	$Close = [\p{Sentence_Break = Close}];
	30
	31	#
	32	# Define extended forms of the character classes: each base class followed
	33	# by any number of trailing $Extend or $Format characters, so that grapheme
	34	# clusters and format chars are absorbed into the class (Rules 4 and 5).
	35	# $Extend is Grapheme_Extend plus the half-width voicing marks.
	36
	37	$CR = \u000d;
	38	$LF = \u000a;
	39	$Extend = [[:Grapheme_Extend = TRUE:]$VoiceMarks];
	40
	41	$SpEx = $Sp ($Extend | $Format)*;
	42	$LowerEx = $Lower ($Extend | $Format)*;
	43	$UpperEx = $Upper ($Extend | $Format)*;
	44	$OLetterEx = $OLetter ($Extend | $Format)*;
	45	$NumericEx = $Numeric ($Extend | $Format)*;
	46	$ATermEx = $ATerm ($Extend | $Format)*;
	47	$STermEx = $STerm ($Extend | $Format)*;
	48	$CloseEx = $Close ($Extend | $Format)*;
	49
	50
	51	## -------------------------------------------------
	52	# Forward rules. !!chain turns on rule chaining.
	53	!!chain;
	54	!!forward;
	55
	56	# Rule 3 - break after separators. Keep CR/LF together.
	57	#
	58	$CR $LF;
	59
	60
	61	# Rule 4 - Break after $Sep.
	62	# Rule 5 - Ignore $Format and $Extend: they stay attached to the
	63	# (optional) preceding non-$Sep character.
	64	[^$Sep]? ($Extend | $Format)*;
	65
	66
	67	# Rule 6 - Keep an $ATerm together with an immediately following $Numeric.
	68	$ATermEx $NumericEx;
	69
	70	# Rule 7 - Keep an $ATerm together with the $Upper characters on both sides of it.
	71	$UpperEx $ATermEx $UpperEx;
	72
	73	# Rule 8 - Keep $ATerm, closers, spaces and non-letters together with a following $Lower.
	74	# Note: follows errata for Unicode 5.0 boundary rules.
	75	$NotLettersEx = [^$OLetter $Upper $Lower $Sep $ATerm $STerm] ($Extend | $Format)*;
	76	$ATermEx $CloseEx* $SpEx* $NotLettersEx* $Lower;
	77
	78	# Rule 8a - Keep a terminator (plus closers and spaces) together with a following terminator.
	79	($STermEx | $ATermEx) $CloseEx* $SpEx* ($STermEx | $ATermEx);
	80
	81	# Rule 9, 10, 11 - Keep trailing $Close, $Sp and one optional $Sep with the terminator.
	82	($STermEx | $ATermEx) $CloseEx* $SpEx* $Sep?;
	83
	84	# Rule 12 - Otherwise, do not break. The second form ends at a $Sep, end of text or CR LF and carries the {100} status tag.
	85	[[^$STerm $ATerm $Close $Sp $Sep $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* .;
	86	[[^$STerm $ATerm $Close $Sp $Sep $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* ([$Sep{eof}] | $CR $LF){100};
	87
	88	## -------------------------------------------------
	89
	90	!!reverse;
	91	# Reversed extended classes: the $Extend / $Format run comes first, then the base class.
	92	$SpEx_R = ($Extend | $Format)* $Sp;
	93	$ATermEx_R = ($Extend | $Format)* $ATerm;
	94	$STermEx_R = ($Extend | $Format)* $STerm;
	95	$CloseEx_R = ($Extend | $Format)* $Close;
	96
	97	#
	98	# Reverse rules.
	99	# For now, use the old-style inexact reverse rules, which are easier
	100	# to write, but less efficient.
	101	# TODO: exact reverse rules. It appears that exact reverse rules
	102	# may require improving support for look-ahead breaks in the
	103	# builder. Needs more investigation.
	104	#
	105
	106	[{bof}] (.? | $LF $CR) [^$Sep]* [$Sep {eof}] ($SpEx_R* $CloseEx_R* ($STermEx_R | $ATermEx_R))*;
	107	#.*;
	108
	109	# Explanation for this rule:
	110	#
	111	# It needs to back over
	112	# The $Sep at which we probably begin
	113	# All of the non $Sep chars leading to the preceding $Sep
	114	# The preceding $Sep, which will be the second one that the rule matches.
	115	# Any immediately preceding STerm or ATerm sequences. We need to see these
	116	# to get the correct rule status when moving forwards again.
	117	#
	118	# [{bof}] inhibits rule chaining. Without this, the rule would loop on itself and match
	119	# the entire string.
	120	#
	121	# (.? | $LF $CR) Matches one $Sep instance. Use .? rather than $Sep because the position might be
	122	# at the beginning of the string at this point, and we don't want to fail.
	123	# Can only use {eof} once, and it is used later.
	124	#