[apple/icu.git] / icuSources / test / intltest / punyref.cpp

// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
 *******************************************************************************
 *
 *   Copyright (C) 2003-2005, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *
 *******************************************************************************
 *   file name:  punyref.cpp
 *   encoding:   UTF-8
 *   tab size:   8 (not used)
 *   indentation:4
 *
 *   created on: 2003feb1
 *   created by: Ram Viswanadha
 */

/*
Disclaimer and license

    Regarding this entire document or any portion of it (including
    the pseudocode and C code), the author makes no guarantees and
    is not responsible for any damage resulting from its use.  The
    author grants irrevocable permission to anyone to use, modify,
    and distribute it in any way that does not diminish the rights
    of anyone else to use, modify, and distribute it, provided that
    redistributed derivative works do not contain misleading author or
    version information.  Derivative works need not be licensed under
    similar terms.

punycode.c 0.4.0 (2001-Nov-17-Sat)
http://www.cs.berkeley.edu/~amc/idn/
Adam M. Costello
http://www.nicemice.net/amc/
*/

/**********************************************************/
/* Implementation (would normally go in its own .c file): */

#include <string.h>

#include "unicode/utypes.h"

#if !UCONFIG_NO_IDNA

#include "punyref.h"

/*** Bootstring parameters for Punycode ***/

/* One constant per line; the note beside each says where this file uses it. */
enum {
  base         = 36,    /* digit values run from 0 to base-1                  */
  tmin         = 1,     /* smallest value the per-digit threshold t can take  */
  tmax         = 26,    /* largest value the per-digit threshold t can take   */
  skew         = 38,    /* added to delta in the divisor of adapt()'s result  */
  damp         = 700,   /* divisor applied to delta on the first adaptation   */
  initial_bias = 72,    /* starting bias for both encoder and decoder         */
  initial_n    = 0x80,  /* starting n; basic code points are those below 0x80 */
  delimiter    = 0x2D   /* separates the basic code points from the digits    */
};

/* basic(cp) tests whether cp is a basic code point, i.e. whether its */
/* value, converted to punycode_uint, is below 0x80:                  */
#define basic(cp) ((punycode_uint)(cp) < 0x80)

/* delim(cp) tests whether cp is a delimiter, i.e. whether it equals */
/* the delimiter constant (0x2D):                                    */
#define delim(cp) ((cp) == delimiter)

U_CDECL_BEGIN
/* decode_digit(cp) maps a basic code point to the digit value it   */
/* stands for: 48..57 ('0'..'9') give 26..35, while 65..90          */
/* ('A'..'Z') and 97..122 ('a'..'z') both give 0..25.  For any      */
/* other cp, which does not represent a value, base is returned.    */

static punycode_uint decode_digit(punycode_uint cp)
{
  /* Each "cp - low < count" test below is a range check done with */
  /* one comparison; it relies on punycode_uint being unsigned, so */
  /* values under "low" wrap around to something large.            */

  if (cp - 48 < 10) {
    return cp - 22;
  }
  if (cp - 65 < 26) {
    return cp - 65;
  }
  if (cp - 97 < 26) {
    return cp - 97;
  }

  /* Not a digit character: report the out-of-range value. */
  return base;
}

/* encode_digit(d,flag) is the inverse of decode_digit: it returns   */
/* the basic code point that stands for digit value d, where d must  */
/* lie in the range 0 to base-1.  Values 0..25 become ASCII a..z     */
/* (or A..Z when flag is nonzero) and values 26..35 become ASCII     */
/* 0..9.  The behavior is undefined if flag is nonzero and digit d   */
/* has no uppercase form.                                            */

static char encode_digit(punycode_uint d, int flag)
{
  /* Start from the digit range: 26 + 22 == 48, the code of '0'. */
  int code = (char) d + 22;

  /* Letters sit 75 higher: 0 + 22 + 75 == 97, the code of 'a'. */
  if (d < 26) {
    code += 75;
  }

  /* Uppercase is 32 below lowercase. */
  if (flag != 0) {
    code -= 32;
  }

  return (char) code;
}

/* flagged(bcp) tests whether a basic code point is flagged */
/* (uppercase).  The behavior is undefined if bcp is not a  */
/* basic code point.  The test is a single unsigned range   */
/* check: it holds when bcp - 65 falls in 0..25.            */

#define flagged(bcp) ((punycode_uint)(bcp) - 65 < 26)

/* encode_basic(bcp,flag) returns bcp with its case forced:       */
/* lowercase when flag is zero, uppercase when flag is nonzero.   */
/* A code point that is not a letter is returned unchanged.  The  */
/* behavior is undefined if bcp is not a basic code point.        */

static char encode_basic(punycode_uint bcp, int flag)
{
  /* First normalize: a lowercase letter (97..122) becomes uppercase. */
  if (bcp - 97 < 26) {
    bcp -= 32;
  }

  /* Without a flag, an uppercase letter (65..90) goes back to lowercase. */
  if (!flag && bcp - 65 < 26) {
    return (char) ((char) bcp + 32);
  }

  return (char) bcp;
}

/*** Platform-specific constants ***/

/* maxint is the maximum value of a punycode_uint variable; the   */
/* encoder and decoder use it as the bound in their overflow      */
/* checks:                                                        */
static const punycode_uint maxint = (punycode_uint) (-1);
/* Because maxint is unsigned, -1 becomes the maximum value. */

/*** Bias adaptation function ***/

/* adapt(delta,numpoints,firsttime) computes the bias to use for the */
/* next generalized variable-length integer.  delta is the value     */
/* that was just encoded or decoded, numpoints is the number of code */
/* points handled so far including the current one (it is used as a  */
/* divisor, so it must not be zero), and firsttime is nonzero for    */
/* the first adaptation only.                                        */

static punycode_uint adapt(
  punycode_uint delta, punycode_uint numpoints, int firsttime )
{
  /* The scaling loop below stops once delta is no larger than this. */
  const punycode_uint limit = ((base - tmin) * tmax) / 2;
  punycode_uint k = 0;

  /* Scale delta down: by damp the first time, by half afterwards. */
  if (firsttime) {
    delta /= damp;
  } else {
    delta /= 2;
  }

  delta += delta / numpoints;

  /* Each division of delta by base - tmin adds base to the result. */
  while (delta > limit) {
    delta /= base - tmin;
    k += base;
  }

  return k + (base - tmin + 1) * delta / (delta + skew);
}

/*** Main encode function ***/

/* punycode_encode() converts an array of code points to Punycode.    */
/*                                                                    */
/*   input_length   number of code points in input                    */
/*   input          the code points to encode                         */
/*   case_flags     may be null; otherwise one flag per input code    */
/*                  point, nonzero asking for the uppercase form      */
/*   output_length  on entry the capacity of output in chars; on      */
/*                  success it receives the number of chars written   */
/*                  and is left untouched on failure                  */
/*   output         receives the encoded chars; this function does    */
/*                  not append a terminator                           */
/*                                                                    */
/* Returns punycode_success, punycode_big_output when output is too   */
/* small, or punycode_overflow when delta would exceed maxint.        */

enum punycode_status punycode_encode(
  punycode_uint input_length,
  const punycode_uint input[],
  const unsigned char case_flags[],
  punycode_uint *output_length,
  char output[] )
{
  const punycode_uint capacity = *output_length;
  punycode_uint written = 0;        /* chars stored in output so far  */
  punycode_uint basic_count;        /* number of basic code points    */
  punycode_uint handled;            /* code points already encoded    */
  punycode_uint current = initial_n;
  punycode_uint delta = 0;
  punycode_uint bias = initial_bias;
  punycode_uint idx;

  /* Pass 1: copy every basic code point straight to the output. */

  for (idx = 0;  idx < input_length;  ++idx) {
    if (!basic(input[idx])) {
      continue;
    }

    /* Keep one spare slot: the delimiter below is stored unchecked. */
    if (capacity - written < 2) {
      return punycode_big_output;
    }

    if (case_flags) {
      output[written] = encode_basic(input[idx], case_flags[idx]);
    } else {
      output[written] = (char) input[idx];
    }
    ++written;
  }

  basic_count = written;
  handled = basic_count;

  if (basic_count > 0) {
    output[written++] = delimiter;
  }

  /* Pass 2: encode the remaining code points in increasing order. */

  while (handled < input_length) {
    /* Everything below current is done; pick the smallest code */
    /* point that is still >= current.                          */
    punycode_uint next = maxint;

    for (idx = 0;  idx < input_length;  ++idx) {
      if (input[idx] >= current && input[idx] < next) {
        next = input[idx];
      }
    }

    /* Advance delta by (next - current) * (handled + 1), refusing */
    /* to do so if the product would not fit.                      */
    if (next - current > (maxint - delta) / (handled + 1)) {
      return punycode_overflow;
    }
    delta += (next - current) * (handled + 1);
    current = next;

    for (idx = 0;  idx < input_length;  ++idx) {
      if (input[idx] < current) {
        /* A smaller code point occupies a position to skip over. */
        ++delta;
        if (delta == 0) {
          return punycode_overflow;
        }
      } else if (input[idx] == current) {
        /* Emit delta as a generalized variable-length integer. */
        punycode_uint q = delta;
        punycode_uint k = base;

        for (;;) {
          punycode_uint t;

          /* Checked before the break too, so the final digit */
          /* stored after the loop always has room.           */
          if (written >= capacity) {
            return punycode_big_output;
          }

          /* Threshold for this digit: k - bias clamped to tmin..tmax. */
          if (k <= bias) {
            t = tmin;
          } else if (k >= bias + tmax) {
            t = tmax;
          } else {
            t = k - bias;
          }

          if (q < t) {
            break;
          }

          output[written++] = encode_digit(t + (q - t) % (base - t), 0);
          q = (q - t) / (base - t);
          k += base;
        }

        /* The last digit carries the case flag of this code point. */
        output[written++] = encode_digit(q, case_flags && case_flags[idx]);
        bias = adapt(delta, handled + 1, handled == basic_count);
        delta = 0;
        ++handled;
      }
    }

    ++delta;
    ++current;
  }

  *output_length = written;
  return punycode_success;
}

/*** Main decode function ***/

/* punycode_decode() converts Punycode back to an array of code       */
/* points.                                                            */
/*                                                                    */
/*   input_length   number of chars in input                          */
/*   input          the Punycode chars to decode                      */
/*   output_length  on entry the capacity of output in code points;   */
/*                  on success it receives the number of code points  */
/*                  written and is left untouched on failure          */
/*   output         receives the decoded code points                  */
/*   case_flags     may be null; otherwise receives one flag per      */
/*                  output code point, nonzero when the corresponding */
/*                  input char was an uppercase letter                */
/*                                                                    */
/* Returns punycode_success, punycode_bad_input for malformed input,  */
/* punycode_big_output when output is too small, or punycode_overflow */
/* when a value would exceed maxint.                                  */

enum punycode_status punycode_decode(
  punycode_uint input_length,
  const char input[],
  punycode_uint *output_length,
  punycode_uint output[],
  unsigned char case_flags[] )
{
  const punycode_uint capacity = *output_length;
  punycode_uint code_point = initial_n;
  punycode_uint bias = initial_bias;
  punycode_uint count = 0;          /* code points stored in output   */
  punycode_uint i = 0;              /* running insertion index        */
  punycode_uint basic_len = 0;      /* chars before the last delimiter */
  punycode_uint pos;                /* index of the next input char   */

  /* Locate the last delimiter; basic_len stays 0 if there is none. */

  for (pos = 0;  pos < input_length;  ++pos) {
    if (delim(input[pos])) {
      basic_len = pos;
    }
  }

  if (basic_len > capacity) {
    return punycode_big_output;
  }

  /* Copy the chars before the last delimiter; all must be basic. */

  for (pos = 0;  pos < basic_len;  ++pos) {
    if (case_flags) {
      case_flags[count] = flagged(input[pos]);
    }
    if (!basic(input[pos])) {
      return punycode_bad_input;
    }
    output[count++] = input[pos];
  }

  /* Resume just past the delimiter if anything was copied, or at */
  /* the very beginning otherwise.                                */

  pos = (basic_len > 0) ? basic_len + 1 : 0;

  while (pos < input_length) {
    const punycode_uint start_i = i;
    const punycode_uint slots = count + 1;  /* insertion positions */
    punycode_uint weight = 1;
    punycode_uint k = base;
    punycode_uint laps;

    /* Read one generalized variable-length integer.  It is added  */
    /* to i digit by digit, which keeps the overflow test simple;  */
    /* the decoded delta is then i - start_i.                      */

    for (;;) {
      punycode_uint digit;
      punycode_uint t;

      if (pos >= input_length) {
        return punycode_bad_input;
      }

      digit = decode_digit(input[pos++]);
      if (digit >= base) {
        return punycode_bad_input;
      }
      if (digit > (maxint - i) / weight) {
        return punycode_overflow;
      }
      i += digit * weight;

      /* Threshold for this digit: k - bias clamped to tmin..tmax. */
      if (k <= bias) {
        t = tmin;
      } else if (k >= bias + tmax) {
        t = tmax;
      } else {
        t = k - bias;
      }

      /* A digit below the threshold is the last one of the integer. */
      if (digit < t) {
        break;
      }

      if (weight > maxint / (base - t)) {
        return punycode_overflow;
      }
      weight *= (base - t);
      k += base;
    }

    bias = adapt(i - start_i, slots, start_i == 0);

    /* Every full lap of i around the slots bumps the code point by */
    /* one; the remainder is the position to insert at.             */

    laps = i / slots;
    if (laps > maxint - code_point) {
      return punycode_overflow;
    }
    code_point += laps;
    i %= slots;

    if (count >= capacity) {
      return punycode_big_output;
    }

    /* Open a gap at index i in both arrays and fill it in. */

    if (case_flags) {
      memmove(case_flags + i + 1, case_flags + i, count - i);
      /* The case of the last digit consumed decides the flag. */
      case_flags[i] = flagged(input[pos - 1]);
    }

    memmove(output + i + 1, output + i, (count - i) * sizeof *output);
    output[i++] = code_point;

    ++count;
  }

  *output_length = count;
  return punycode_success;
}

U_CDECL_END

#endif /* #if !UCONFIG_NO_IDNA */
Commit	Line	Data
f3c0d7a5 A	1	// © 2016 and later: Unicode, Inc. and others.
f3c0d7a5 A	2	// License & terms of use: http://www.unicode.org/copyright.html
b75a7d8f A	3	/*
	4	*******************************************************************************
	5	*
73c04bcf	6	* Copyright (C) 2003-2005, International Business Machines
b75a7d8f A	7	* Corporation and others. All Rights Reserved.
	8	*
	9	*******************************************************************************
f3c0d7a5 A	10	* file name: punyref.cpp
f3c0d7a5 A	11	* encoding: UTF-8
b75a7d8f A	12	* tab size: 8 (not used)
	13	* indentation:4
	14	*
	15	* created on: 2003feb1
	16	* created by: Ram Viswanadha
	17	*/
	18
	19	/*
	20	Disclaimer and license
	21
	22	Regarding this entire document or any portion of it (including
	23	the pseudocode and C code), the author makes no guarantees and
	24	is not responsible for any damage resulting from its use. The
	25	author grants irrevocable permission to anyone to use, modify,
	26	and distribute it in any way that does not diminish the rights
	27	of anyone else to use, modify, and distribute it, provided that
	28	redistributed derivative works do not contain misleading author or
	29	version information. Derivative works need not be licensed under
	30	similar terms.
	31
	32	punycode.c 0.4.0 (2001-Nov-17-Sat)
	33	http://www.cs.berkeley.edu/~amc/idn/
	34	Adam M. Costello
	35	http://www.nicemice.net/amc/
	36	*/
	37
	38	/**********************************************************/
	39	/* Implementation (would normally go in its own .c file): */
	40
	41	#include <string.h>
	42
	43	#include "unicode/utypes.h"
	44
	45	#if !UCONFIG_NO_IDNA
	46
	47	#include "punyref.h"
	48
	49	/* Bootstring parameters for Punycode */
	50
	51	enum { base = 36, tmin = 1, tmax = 26, skew = 38, damp = 700,
	52	initial_bias = 72, initial_n = 0x80, delimiter = 0x2D };
	53
	54	/* basic(cp) tests whether cp is a basic code point: */
	55	#define basic(cp) ((punycode_uint)(cp) < 0x80)
	56
	57	/* delim(cp) tests whether cp is a delimiter: */
	58	#define delim(cp) ((cp) == delimiter)
	59
	60	U_CDECL_BEGIN
	61	/* decode_digit(cp) returns the numeric value of a basic code */
	62	/* point (for use in representing integers) in the range 0 to */
	63	/* base-1, or base if cp is does not represent a value. */
	64
	65	static punycode_uint decode_digit(punycode_uint cp)
	66	{
	67	return cp - 48 < 10 ? cp - 22 : cp - 65 < 26 ? cp - 65 :
	68	cp - 97 < 26 ? cp - 97 : base;
	69	}
	70
	71	/* encode_digit(d,flag) returns the basic code point whose value */
	72	/* (when used for representing integers) is d, which needs to be in */
	73	/* the range 0 to base-1. The lowercase form is used unless flag is */
	74	/* nonzero, in which case the uppercase form is used. The behavior */
	75	/* is undefined if flag is nonzero and digit d has no uppercase form. */
76
77	static char encode_digit(punycode_uint d, int flag)
78	{
79	return (char) d + 22 + 75 * (d < 26) - ((flag != 0) << 5);
80	/* 0..25 map to ASCII a..z or A..Z */
81	/* 26..35 map to ASCII 0..9 */
82	}
83
84	/* flagged(bcp) tests whether a basic code point is flagged */
85	/* (uppercase). The behavior is undefined if bcp is not a */
86	/* basic code point. */
87
88	#define flagged(bcp) ((punycode_uint)(bcp) - 65 < 26)
89
90	/* encode_basic(bcp,flag) forces a basic code point to lowercase */
91	/* if flag is zero, uppercase if flag is nonzero, and returns */
92	/* the resulting code point. The code point is unchanged if it */
93	/* is caseless. The behavior is undefined if bcp is not a basic */
94	/* code point. */
95
96	static char encode_basic(punycode_uint bcp, int flag)
97	{
98	bcp -= (bcp - 97 < 26) << 5;
99	return (char) bcp + ((!flag && (bcp - 65 < 26)) << 5);
100	}
101
102	/* Platform-specific constants */
103
104	/* maxint is the maximum value of a punycode_uint variable: */
105	static const punycode_uint maxint = (punycode_uint) (-1);
106	/* Because maxint is unsigned, -1 becomes the maximum value. */
107
108	/* Bias adaptation function */
109
110	static punycode_uint adapt(
111	punycode_uint delta, punycode_uint numpoints, int firsttime )
112	{
113	punycode_uint k;
114
115	delta = firsttime ? delta / damp : delta >> 1;
116	/* delta >> 1 is a faster way of doing delta / 2 */
117	delta += delta / numpoints;
118
119	for (k = 0; delta > ((base - tmin) * tmax) / 2; k += base) {
120	delta /= base - tmin;
121	}
122
123	return k + (base - tmin + 1) * delta / (delta + skew);
124	}
125
126	/* Main encode function */
127
128	enum punycode_status punycode_encode(
129	punycode_uint input_length,
130	const punycode_uint input[],
131	const unsigned char case_flags[],
132	punycode_uint *output_length,
133	char output[] )
134	{
135	punycode_uint n, delta, h, b, out, max_out, bias, j, m, q, k, t;
136
137	/* Initialize the state: */
138
139	n = initial_n;
140	delta = out = 0;
141	max_out = *output_length;
142	bias = initial_bias;
143
144	/* Handle the basic code points: */
145
146	for (j = 0; j < input_length; ++j) {
147	if (basic(input[j])) {
148	if (max_out - out < 2) return punycode_big_output;
149	output[out++] = (char)
150	(case_flags ? encode_basic(input[j], case_flags[j]) : input[j]);
151	}
152	/* else if (input[j] < n) return punycode_bad_input; */
153	/* (not needed for Punycode with unsigned code points) */
154	}
155
156	h = b = out;
157
158	/* h is the number of code points that have been handled, b is the */
159	/* number of basic code points, and out is the number of characters */
160	/* that have been output. */
161
162	if (b > 0) output[out++] = delimiter;
163
164	/* Main encoding loop: */
165
166	while (h < input_length) {
167	/* All non-basic code points < n have been */
168	/* handled already. Find the next larger one: */
169
170	for (m = maxint, j = 0; j < input_length; ++j) {
171	/* if (basic(input[j])) continue; */
172	/* (not needed for Punycode) */
173	if (input[j] >= n && input[j] < m) m = input[j];
174	}
175
176	/* Increase delta enough to advance the decoder's */
177	/* <n,i> state to <m,0>, but guard against overflow: */
178
179	if (m - n > (maxint - delta) / (h + 1)) return punycode_overflow;
180	delta += (m - n) * (h + 1);
181	n = m;
182
183	for (j = 0; j < input_length; ++j) {
184	/* Punycode does not need to check whether input[j] is basic: */
	185	if (input[j] < n /* || basic(input[j]) */ ) {
186	if (++delta == 0) return punycode_overflow;
187	}
188
189	if (input[j] == n) {
190	/* Represent delta as a generalized variable-length integer: */
191
192	for (q = delta, k = base; ; k += base) {
193	if (out >= max_out) return punycode_big_output;
	194	t = k <= bias /* + tmin */ ? tmin : /* +tmin not needed */
195	k >= bias + tmax ? tmax : k - bias;
196	if (q < t) break;
197	output[out++] = encode_digit(t + (q - t) % (base - t), 0);
198	q = (q - t) / (base - t);
199	}
200
201	output[out++] = encode_digit(q, case_flags && case_flags[j]);
202	bias = adapt(delta, h + 1, h == b);
203	delta = 0;
204	++h;
205	}
206	}
207
208	++delta, ++n;
209	}
210
211	*output_length = out;
212	return punycode_success;
213	}
214
215	/* Main decode function */
216
217	enum punycode_status punycode_decode(
218	punycode_uint input_length,
219	const char input[],
220	punycode_uint *output_length,
221	punycode_uint output[],
222	unsigned char case_flags[] )
223	{
224	punycode_uint n, out, i, max_out, bias,
225	b, j, in, oldi, w, k, digit, t;
226
227	/* Initialize the state: */
228
229	n = initial_n;
230	out = i = 0;
231	max_out = *output_length;
232	bias = initial_bias;
233
234	/* Handle the basic code points: Let b be the number of input code */
235	/* points before the last delimiter, or 0 if there is none, then */
236	/* copy the first b code points to the output. */
237
238	for (b = j = 0; j < input_length; ++j) if (delim(input[j])) b = j;
239	if (b > max_out) return punycode_big_output;
240
241	for (j = 0; j < b; ++j) {
242	if (case_flags) case_flags[out] = flagged(input[j]);
243	if (!basic(input[j])) return punycode_bad_input;
244	output[out++] = input[j];
245	}
246
247	/* Main decoding loop: Start just after the last delimiter if any */
248	/* basic code points were copied; start at the beginning otherwise. */
249
250	for (in = b > 0 ? b + 1 : 0; in < input_length; ++out) {
251
252	/* in is the index of the next character to be consumed, and */
253	/* out is the number of code points in the output array. */
254
255	/* Decode a generalized variable-length integer into delta, */
256	/* which gets added to i. The overflow checking is easier */
257	/* if we increase i as we go, then subtract off its starting */
258	/* value at the end to obtain delta. */
259
260	for (oldi = i, w = 1, k = base; ; k += base) {
261	if (in >= input_length) return punycode_bad_input;
262	digit = decode_digit(input[in++]);
263	if (digit >= base) return punycode_bad_input;
264	if (digit > (maxint - i) / w) return punycode_overflow;
265	i += digit * w;
	266	t = k <= bias /* + tmin */ ? tmin : /* +tmin not needed */
267	k >= bias + tmax ? tmax : k - bias;
268	if (digit < t) break;
269	if (w > maxint / (base - t)) return punycode_overflow;
270	w *= (base - t);
271	}
272
273	bias = adapt(i - oldi, out + 1, oldi == 0);
274
275	/* i was supposed to wrap around from out+1 to 0, */
276	/* incrementing n each time, so we'll fix that now: */
277
278	if (i / (out + 1) > maxint - n) return punycode_overflow;
279	n += i / (out + 1);
280	i %= (out + 1);
281
282	/* Insert n at position i of the output: */
283
284	/* not needed for Punycode: */
285	/* if (decode_digit(n) <= base) return punycode_invalid_input; */
286	if (out >= max_out) return punycode_big_output;
287
288	if (case_flags) {
289	memmove(case_flags + i + 1, case_flags + i, out - i);
290	/* Case of last character determines uppercase flag: */
291	case_flags[i] = flagged(input[in - 1]);
292	}
293
294	memmove(output + i + 1, output + i, (out - i) * sizeof *output);
295	output[i++] = n;
296	}
297
298	*output_length = out;
299	return punycode_success;
300	}
301
302	U_CDECL_END
303
304	#endif /* #if !UCONFIG_NO_IDNA */