git.saurik.com Git - apple/icu.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	*******************************************************************************
	3	*
	4	* Copyright (C) 2002-2003, International Business Machines
	5	* Corporation and others. All Rights Reserved.
	6	*
	7	*******************************************************************************
	8	* file name: punycode.c
	9	* encoding: US-ASCII
	10	* tab size: 8 (not used)
	11	* indentation:4
	12	*
	13	* created on: 2002jan31
	14	* created by: Markus W. Scherer
	15	*/
	16
	17
	18	/* This ICU code derived from: */
	19	/*
	20	punycode.c 0.4.0 (2001-Nov-17-Sat)
	21	http://www.cs.berkeley.edu/~amc/idn/
	22	Adam M. Costello
	23	http://www.nicemice.net/amc/
	24
	25	Disclaimer and license
	26
	27	Regarding this entire document or any portion of it (including
	28	the pseudocode and C code), the author makes no guarantees and
	29	is not responsible for any damage resulting from its use. The
	30	author grants irrevocable permission to anyone to use, modify,
	31	and distribute it in any way that does not diminish the rights
	32	of anyone else to use, modify, and distribute it, provided that
	33	redistributed derivative works do not contain misleading author or
	34	version information. Derivative works need not be licensed under
	35	similar terms.
	36	*/
	37	/*
	38	* ICU modifications:
	39	* - ICU data types and coding conventions
	40	* - ICU string buffer handling with implicit source lengths
	41	* and destination preflighting
	42	* - UTF-16 handling
	43	*/
	44
	45	#include "unicode/utypes.h"
	46
	47	#if !UCONFIG_NO_IDNA
	48
	49	#include "ustr_imp.h"
	50	#include "cstring.h"
	51	#include "cmemory.h"
	52	#include "punycode.h"
	53	#include "unicode/ustring.h"
	54
	55
	56	/* Punycode ----------------------------------------------------------------- */
	57
	58	/* Punycode parameters for Bootstring */
	59	#define BASE 36
	60	#define TMIN 1
	61	#define TMAX 26
	62	#define SKEW 38
	63	#define DAMP 700
	64	#define INITIAL_BIAS 72
	65	#define INITIAL_N 0x80
	66
	67	/* "Basic" Unicode/ASCII code points */
	68	#define _HYPHEN 0X2d
	69	#define DELIMITER _HYPHEN
	70
	71	#define _ZERO_ 0X30
	72	#define _NINE 0x39
	73
	74	#define _SMALL_A 0X61
	75	#define _SMALL_Z 0X7a
	76
	77	#define _CAPITAL_A 0X41
	78	#define _CAPITAL_Z 0X5a
	79
	80	#define IS_BASIC(c) ((c)<0x80)
	81	#define IS_BASIC_UPPERCASE(c) (_CAPITAL_A<=(c) && (c)<=_CAPITAL_Z)
	82
	83	/**
	84	* digitToBasic() returns the basic code point whose value
	85	* (when used for representing integers) is d, which must be in the
	86	* range 0 to BASE-1. The lowercase form is used unless the uppercase flag is
	87	* nonzero, in which case the uppercase form is used.
	88	*/
	89	static U_INLINE char
	90	digitToBasic(int32_t digit, UBool uppercase) {
	91	/* 0..25 map to ASCII a..z or A..Z */
	92	/* 26..35 map to ASCII 0..9 */
	93	if(digit<26) {
	94	if(uppercase) {
	95	return (char)(_CAPITAL_A+digit);
	96	} else {
	97	return (char)(_SMALL_A+digit);
	98	}
	99	} else {
	100	return (char)((_ZERO_-26)+digit);
	101	}
	102	}
	103
	104	/**
	105	* basicToDigit[] contains the numeric value of a basic code
	106	* point (for use in representing integers) in the range 0 to
	107	* BASE-1, or -1 if b is does not represent a value.
	108	*/
	109	static const int8_t
	110	basicToDigit[256]={
	111	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	112	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	113
	114	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	115	26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1,
	116
	117	-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
	118	15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
	119
	120	-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
	121	15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
	122
	123	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	124	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	125
	126	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	127	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	128
	129	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	130	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	131
	132	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	133	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
	134	};
	135
	136	static U_INLINE char
	137	asciiCaseMap(char b, UBool uppercase) {
	138	if(uppercase) {
	139	if(_SMALL_A<=b && b<=_SMALL_Z) {
	140	b-=(_SMALL_A-_CAPITAL_A);
	141	}
	142	} else {
	143	if(_CAPITAL_A<=b && b<=_CAPITAL_Z) {
	144	b+=(_SMALL_A-_CAPITAL_A);
	145	}
	146	}
	147	return b;
	148	}
	149
	150	/* Punycode-specific Bootstring code ---------------------------------------- */
	151
	152	/*
	153	* The following code omits the {parts} of the pseudo-algorithm in the spec
	154	* that are not used with the Punycode parameter set.
	155	*/
	156
	157	/* Bias adaptation function. */
	158	static int32_t
	159	adaptBias(int32_t delta, int32_t length, UBool firstTime) {
	160	int32_t count;
	161
	162	if(firstTime) {
	163	delta/=DAMP;
	164	} else {
	165	delta/=2;
	166	}
	167
	168	delta+=delta/length;
	169	for(count=0; delta>((BASE-TMIN)*TMAX)/2; count+=BASE) {
	170	delta/=(BASE-TMIN);
	171	}
	172
	173	return count+(((BASE-TMIN+1)*delta)/(delta+SKEW));
	174	}
	175
	176	#define MAX_CP_COUNT 200
	177
	178	U_CFUNC int32_t
	179	u_strToPunycode(const UChar *src, int32_t srcLength,
	180	UChar *dest, int32_t destCapacity,
	181	const UBool *caseFlags,
	182	UErrorCode *pErrorCode) {
	183
	184	int32_t cpBuffer[MAX_CP_COUNT];
	185	int32_t n, delta, handledCPCount, basicLength, destLength, bias, j, m, q, k, t, srcCPCount;
	186	UChar c, c2;
	187
	188	/* argument checking */
	189	if(pErrorCode==NULL \|\| U_FAILURE(*pErrorCode)) {
	190	return 0;
	191	}
	192
	193	if(src==NULL \|\| srcLength<-1 \|\| (dest==NULL && destCapacity!=0)) {
	194	*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
	195	return 0;
	196	}
	197
	198	/*
	199	* Handle the basic code points and
	200	* convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit):
	201	*/
	202	srcCPCount=destLength=0;
	203	if(srcLength==-1) {
	204	/* NUL-terminated input */
	205	for(j=0; /* no condition */; ++j) {
	206	if((c=src[j])==0) {
	207	break;
	208	}
	209	if(srcCPCount==MAX_CP_COUNT) {
	210	/* too many input code points */
	211	*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
	212	return 0;
	213	}
	214	if(IS_BASIC(c)) {
	215	cpBuffer[srcCPCount++]=0;
	216	if(destLength<destCapacity) {
	217	dest[destLength]=
	218	caseFlags!=NULL ?
	219	asciiCaseMap((char)c, caseFlags[j]) :
	220	(char)c;
	221	}
	222	++destLength;
	223	} else {
	224	n=(caseFlags!=NULL && caseFlags[j])<<31L;
	225	if(UTF_IS_SINGLE(c)) {
	226	n\|=c;
	227	} else if(UTF_IS_LEAD(c) && UTF_IS_TRAIL(c2=src[j+1])) {
	228	++j;
	229	n\|=(int32_t)UTF16_GET_PAIR_VALUE(c, c2);
	230	} else {
	231	/* error: unmatched surrogate */
	232	*pErrorCode=U_INVALID_CHAR_FOUND;
	233	return 0;
	234	}
	235	cpBuffer[srcCPCount++]=n;
	236	}
	237	}
	238	} else {
	239	/* length-specified input */
	240	for(j=0; j<srcLength; ++j) {
	241	if(srcCPCount==MAX_CP_COUNT) {
	242	/* too many input code points */
	243	*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
	244	return 0;
	245	}
	246	c=src[j];
	247	if(IS_BASIC(c)) {
	248	if(destLength<destCapacity) {
	249	cpBuffer[srcCPCount++]=0;
	250	dest[destLength]=
	251	caseFlags!=NULL ?
	252	asciiCaseMap((char)c, caseFlags[j]) :
	253	(char)c;
	254	}
	255	++destLength;
	256	} else {
	257	n=(caseFlags!=NULL && caseFlags[j])<<31L;
	258	if(UTF_IS_SINGLE(c)) {
	259	n\|=c;
	260	} else if(UTF_IS_LEAD(c) && (j+1)<srcLength && UTF_IS_TRAIL(c2=src[j+1])) {
	261	++j;
	262	n\|=(int32_t)UTF16_GET_PAIR_VALUE(c, c2);
	263	} else {
	264	/* error: unmatched surrogate */
	265	*pErrorCode=U_INVALID_CHAR_FOUND;
	266	return 0;
	267	}
	268	cpBuffer[srcCPCount++]=n;
	269	}
	270	}
	271	}
	272
	273	/* Finish the basic string - if it is not empty - with a delimiter. */
	274	basicLength=destLength;
	275	if(basicLength>0) {
	276	if(destLength<destCapacity) {
	277	dest[destLength]=DELIMITER;
	278	}
	279	++destLength;
	280	}
	281
	282	/*
	283	* handledCPCount is the number of code points that have been handled
	284	* basicLength is the number of basic code points
	285	* destLength is the number of chars that have been output
	286	*/
	287
	288	/* Initialize the state: */
	289	n=INITIAL_N;
	290	delta=0;
	291	bias=INITIAL_BIAS;
	292
	293	/* Main encoding loop: */
	294	for(handledCPCount=basicLength; handledCPCount<srcCPCount; /* no op */) {
	295	/*
	296	* All non-basic code points < n have been handled already.
	297	* Find the next larger one:
	298	*/
	299	for(m=0x7fffffff, j=0; j<srcCPCount; ++j) {
	300	q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
	301	if(n<=q && q<m) {
	302	m=q;
	303	}
	304	}
	305
	306	/*
	307	* Increase delta enough to advance the decoder's
	308	* <n,i> state to <m,0>, but guard against overflow:
	309	*/
	310	if(m-n>(0x7fffffff-MAX_CP_COUNT-delta)/(handledCPCount+1)) {
	311	*pErrorCode=U_INTERNAL_PROGRAM_ERROR;
	312	return 0;
	313	}
	314	delta+=(m-n)*(handledCPCount+1);
	315	n=m;
	316
	317	/* Encode a sequence of same code points n */
	318	for(j=0; j<srcCPCount; ++j) {
	319	q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
	320	if(q<n) {
	321	++delta;
	322	} else if(q==n) {
	323	/* Represent delta as a generalized variable-length integer: */
	324	for(q=delta, k=BASE; /* no condition */; k+=BASE) {
	325
	326	/** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt
	327
	328	t=k-bias;
	329	if(t<TMIN) {
	330	t=TMIN;
	331	} else if(t>TMAX) {
	332	t=TMAX;
	333	}
	334	*/
	335
	336	t=k-bias;
	337	if(t<TMIN) {
	338	t=TMIN;
	339	} else if(k>=(bias+TMAX)) {
	340	t=TMAX;
	341	}
	342
	343	if(q<t) {
	344	break;
	345	}
	346
	347	if(destLength<destCapacity) {
	348	dest[destLength++]=digitToBasic(t+(q-t)%(BASE-t), 0);
	349	}
	350	q=(q-t)/(BASE-t);
	351	}
	352
	353	if(destLength<destCapacity) {
	354	dest[destLength++]=digitToBasic(q, (UBool)(cpBuffer[j]<0));
	355	}
	356	bias=adaptBias(delta, handledCPCount+1, (UBool)(handledCPCount==basicLength));
	357	delta=0;
	358	++handledCPCount;
	359	}
	360	}
	361
	362	++delta;
	363	++n;
	364	}
	365
	366	return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
	367	}
	368
	369	U_CFUNC int32_t
	370	u_strFromPunycode(const UChar *src, int32_t srcLength,
	371	UChar *dest, int32_t destCapacity,
	372	UBool *caseFlags,
	373	UErrorCode *pErrorCode) {
	374	int32_t n, destLength, i, bias, basicLength, j, in, oldi, w, k, digit, t,
	375	destCPCount, firstSupplementaryIndex, cpLength;
	376	UChar b;
	377
	378	/* argument checking */
	379	if(pErrorCode==NULL \|\| U_FAILURE(*pErrorCode)) {
	380	return 0;
	381	}
	382
	383	if(src==NULL \|\| srcLength<-1 \|\| (dest==NULL && destCapacity!=0)) {
	384	*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
	385	return 0;
	386	}
	387
	388	if(srcLength==-1) {
	389	srcLength=u_strlen(src);
	390	}
	391
	392	/*
	393	* Handle the basic code points:
	394	* Let basicLength be the number of input code points
	395	* before the last delimiter, or 0 if there is none,
	396	* then copy the first basicLength code points to the output.
	397	*
	398	* The two following loops iterate backward.
	399	*/
	400	for(j=srcLength; j>0;) {
	401	if(src[--j]==DELIMITER) {
	402	break;
	403	}
	404	}
	405	destLength=basicLength=destCPCount=j;
	406
	407	while(j>0) {
	408	b=src[--j];
	409	if(!IS_BASIC(b)) {
	410	*pErrorCode=U_INVALID_CHAR_FOUND;
	411	return 0;
	412	}
	413
	414	if(j<destCapacity) {
	415	dest[j]=(UChar)b;
	416
	417	if(caseFlags!=NULL) {
	418	caseFlags[j]=IS_BASIC_UPPERCASE(b);
	419	}
	420	}
	421	}
	422
	423	/* Initialize the state: */
	424	n=INITIAL_N;
	425	i=0;
	426	bias=INITIAL_BIAS;
	427	firstSupplementaryIndex=1000000000;
	428
	429	/*
	430	* Main decoding loop:
	431	* Start just after the last delimiter if any
	432	* basic code points were copied; start at the beginning otherwise.
	433	*/
	434	for(in=basicLength>0 ? basicLength+1 : 0; in<srcLength; /* no op */) {
	435	/*
	436	* in is the index of the next character to be consumed, and
	437	* destCPCount is the number of code points in the output array.
	438	*
	439	* Decode a generalized variable-length integer into delta,
	440	* which gets added to i. The overflow checking is easier
	441	* if we increase i as we go, then subtract off its starting
	442	* value at the end to obtain delta.
	443	*/
	444	for(oldi=i, w=1, k=BASE; /* no condition */; k+=BASE) {
	445	if(in>=srcLength) {
	446	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
	447	return 0;
	448	}
	449
	450	digit=basicToDigit[(uint8_t)src[in++]];
	451	if(digit<0) {
	452	*pErrorCode=U_INVALID_CHAR_FOUND;
	453	return 0;
	454	}
	455	if(digit>(0x7fffffff-i)/w) {
	456	/* integer overflow */
	457	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
	458	return 0;
	459	}
	460
	461	i+=digit*w;
	462	/** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt
	463	t=k-bias;
	464	if(t<TMIN) {
	465	t=TMIN;
	466	} else if(t>TMAX) {
	467	t=TMAX;
	468	}
	469	*/
	470	t=k-bias;
	471	if(t<TMIN) {
	472	t=TMIN;
	473	} else if(k>=(bias+TMAX)) {
	474	t=TMAX;
	475	}
	476	if(digit<t) {
	477	break;
	478	}
	479
	480	if(w>0x7fffffff/(BASE-t)) {
	481	/* integer overflow */
	482	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
	483	return 0;
	484	}
	485	w*=BASE-t;
	486	}
	487
	488	/*
	489	* Modification from sample code:
	490	* Increments destCPCount here,
	491	* where needed instead of in for() loop tail.
	492	*/
	493	++destCPCount;
	494	bias=adaptBias(i-oldi, destCPCount, (UBool)(oldi==0));
	495
	496	/*
	497	* i was supposed to wrap around from (incremented) destCPCount to 0,
	498	* incrementing n each time, so we'll fix that now:
	499	*/
	500	if(i/destCPCount>(0x7fffffff-n)) {
	501	/* integer overflow */
	502	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
	503	return 0;
	504	}
	505
	506	n+=i/destCPCount;
	507	i%=destCPCount;
	508	/* not needed for Punycode: */
	509	/* if (decode_digit(n) <= BASE) return punycode_invalid_input; */
	510
	511	if(n>0x10ffff \|\| UTF_IS_SURROGATE(n)) {
	512	/* Unicode code point overflow */
	513	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
	514	return 0;
	515	}
	516
	517	/* Insert n at position i of the output: */
	518	cpLength=UTF_CHAR_LENGTH(n);
	519	if((destLength+cpLength)<destCapacity) {
	520	int32_t codeUnitIndex;
	521
	522	/*
	523	* Handle indexes when supplementary code points are present.
	524	*
	525	* In almost all cases, there will be only BMP code points before i
	526	* and even in the entire string.
	527	* This is handled with the same efficiency as with UTF-32.
	528	*
	529	* Only the rare cases with supplementary code points are handled
	530	* more slowly - but not too bad since this is an insertion anyway.
	531	*/
	532	if(i<=firstSupplementaryIndex) {
	533	codeUnitIndex=i;
	534	if(cpLength>1) {
	535	firstSupplementaryIndex=codeUnitIndex;
	536	} else {
	537	++firstSupplementaryIndex;
	538	}
	539	} else {
	540	codeUnitIndex=firstSupplementaryIndex;
	541	UTF_FWD_N(dest, codeUnitIndex, destLength, i-codeUnitIndex);
	542	}
	543
	544	/* use the UChar index codeUnitIndex instead of the code point index i */
	545	if(codeUnitIndex<destLength) {
	546	uprv_memmove(dest+codeUnitIndex+cpLength,
	547	dest+codeUnitIndex,
	548	(destLength-codeUnitIndex)*U_SIZEOF_UCHAR);
	549	if(caseFlags!=NULL) {
	550	uprv_memmove(caseFlags+codeUnitIndex+cpLength,
	551	caseFlags+codeUnitIndex,
	552	destLength-codeUnitIndex);
	553	}
	554	}
	555	if(cpLength==1) {
	556	/* BMP, insert one code unit */
	557	dest[codeUnitIndex]=(UChar)n;
	558	} else {
	559	/* supplementary character, insert two code units */
	560	dest[codeUnitIndex]=UTF16_LEAD(n);
	561	dest[codeUnitIndex+1]=UTF16_TRAIL(n);
	562	}
	563	if(caseFlags!=NULL) {
	564	/* Case of last character determines uppercase flag: */
	565	caseFlags[codeUnitIndex]=IS_BASIC_UPPERCASE(src[in-1]);
	566	if(cpLength==2) {
	567	caseFlags[codeUnitIndex+1]=FALSE;
	568	}
	569	}
	570	}
	571	destLength+=cpLength;
	572	++i;
	573	}
	574
	575	return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
	576	}
	577
	578	/* ### check notes on overflow handling - only necessary if not IDNA? are these Punycode functions to be public? */
	579
	580	#endif /* #if !UCONFIG_NO_IDNA */