git.saurik.com Git - apple/icu.git/blame - icuSources/extra/uconv/uconv.1.in

Commit	Line	Data
b75a7d8f A	1	.\" Hey, Emacs! This is --nroff-- you know...
	2	.\"
	3	.\" uconv.1: manual page for the uconv utility.
	4	.\"
f3c0d7a5 A	5	.\" Copyright (C) 2016 and later: Unicode, Inc. and others.
f3c0d7a5 A	6	.\" License & terms of use: http://www.unicode.org/copyright.html
57a6839d	7	.\" Copyright (C) 2000-2013 IBM, Inc. and others.
b75a7d8f A	8	.\"
	9	.\" Manual page by Yves Arrouye <yves@realnames.com>.
	10	.\"
73c04bcf	11	.TH UCONV 1 "2005-jul-1" "ICU MANPAGE" "ICU @VERSION@ Manual"
b75a7d8f A	12	.SH NAME
	13	.B uconv
	14	\- convert data from one encoding to another
	15	.SH SYNOPSIS
	16	.B uconv
	17	[
	18	.BR "\-h\fP, \fB\-?\fP, \fB\-\-help"
	19	]
	20	[
	21	.BI "\-V\fP, \fB\-\-version"
	22	]
	23	[
	24	.BI "\-s\fP, \fB\-\-silent"
	25	]
	26	[
	27	.BI "\-v\fP, \fB\-\-verbose"
	28	]
	29	[
	30	.BI "\-l\fP, \fB\-\-list"
	31	|
	32	.BI "\-l\fP, \fB\-\-list\-code" " code"
	33	|
	34	.BI "\-\-default-code"
	35	|
	36	.BI "\-L\fP, \fB\-\-list\-transliterators"
	37	]
	38	[
	39	.BI "\-\-canon"
	40	]
	41	[
	42	.BI "\-x" " transliteration"
	43	]
	44	[
	45	.BI "\-\-to\-callback" " callback"
	46	|
	47	.B "\-c"
	48	]
	49	[
	50	.BI "\-\-from\-callback" " callback"
	51	|
	52	.B "\-i"
	53	]
	54	[
	55	.BI "\-\-callback" " callback"
	56	]
	57	[
	58	.BI "\-\-fallback"
	59	|
	60	.BI "\-\-no\-fallback"
	61	]
	62	[
	63	.BI "\-b\fP, \fB\-\-block\-size" " size"
	64	]
	65	[
	66	.BI "\-f\fP, \fB\-\-from\-code" " encoding"
	67	]
	68	[
	69	.BI "\-t\fP, \fB\-\-to\-code" " encoding"
	70	]
	71	[
374ca955 A	72	.BI "\-\-add\-signature"
	73	]
	74	[
	75	.BI "\-\-remove\-signature"
	76	]
	77	[
b75a7d8f A	78	.BI "\-o\fP, \fB\-\-output" " file"
	79	]
	80	[
	81	.IR file .\\|.\\|.
	82	]
	83	.SH DESCRIPTION
	84	.B uconv
	85	converts, or transcodes, each given
	86	.I file
	87	(or its standard input if no
	88	.I file
	89	is specified) from one
	90	.I encoding
	91	to another.
	92	The transcoding is done using Unicode as a pivot encoding
	93	(i.e. the data are first transcoded from their original encoding to
	94	Unicode, and then from Unicode to the destination encoding).
	95	.PP
	96	If an
	97	.I encoding
	98	is not specified or is
	99	.BR \- ,
	100	the default encoding is used. Thus, calling
	101	.B uconv
	102	with no
	103	.I encoding
	104	provides an easy way to validate and sanitize data files for
	105	further consumption by tools requiring data in the default encoding.
	106	.PP
	107	When calling
	108	.BR uconv ,
	109	it is possible to specify callbacks that are used to handle invalid
	110	characters in the input, or characters that cannot be transcoded to
	111	the destination encoding. Some encodings, for example, offer a default
0f5d89e8	112	substitution character that can be used to represent the occurrence of
b75a7d8f A	113	such characters in the input. Other callbacks offer a useful visual
	114	representation of the invalid data.
	115	.PP
	116	.B uconv
	117	can also run the specified
	118	.IR transliteration
	119	on the transcoded data,
	120	in which case transliteration will happen as an intermediate step,
	121	after the data have been transcoded to Unicode.
	122	The
	123	.I transliteration
	124	can be either a list of semicolon-separated transliterator names,
374ca955	125	or an arbitrarily complex set of rules in the ICU transliteration
b75a7d8f A	126	rules format.
	127	.PP
	128	For transcoding purposes,
	129	.B uconv
	130	options are compatible with those of
	131	.BR iconv (1),
374ca955	132	making it easy to replace it in scripts. It is not necessarily the case,
b75a7d8f A	133	however, that the encoding names used by
	134	.B uconv
	135	and ICU are the same as the ones used by
	136	.BR iconv (1).
	137	Also, options that provide informational data, such as the
	138	.B \-l\fP, \fB\-\-list
	139	one offered by some
	140	.BR iconv (1)
	141	variants such as GNU's, produce data in a slightly different and
	142	easier to parse format.
	143	.SH OPTIONS
	144	.TP
	145	.BR "\-h\fP, \fB\-?\fP, \fB\-\-help"
	146	Print help about usage and exit.
	147	.TP
	148	.BR "\-V\fP, \fB\-\-version"
	149	Print the version of
	150	.B uconv
	151	and exit.
	152	.TP
	153	.BI "\-s\fP, \fB\-\-silent"
	154	Suppress messages during execution.
	155	.TP
	156	.BI "\-v\fP, \fB\-\-verbose"
	157	Display extra informative messages during execution.
	158	.TP
	159	.BI "\-l\fP, \fB\-\-list"
	160	List all the available encodings and exit.
	161	.TP
	162	.BI "\-l\fP, \fB\-\-list\-code" " code"
	163	List only the
	164	.I code
	165	encoding and exit. If
	166	.I code
	167	is not a proper encoding, exit with an error.
	168	.TP
	169	.BI "\-\-default-code"
	170	List only the name of the default encoding and exit.
	171	.TP
	172	.BI "\-L\fP, \fB\-\-list\-transliterators"
	173	List all the available transliterators and exit.
	174	.TP
	175	.BI "\-\-canon"
	176	If used with
	177	.BI "\-l\fP, \fB\-\-list"
	178	or
	179	.BR "\-\-default-code" ,
	180	the list of encodings is produced in a format compatible with
	181	.BR convrtrs.txt (5).
	182	If used with
	183	.BR "\-L\fP, \fB\-\-list\-transliterators" ,
	184	print only one transliterator name per line.
	185	.TP
	186	.BI "\-x" " transliteration"
	187	Run the given
	188	.IR transliteration
	189	on the transcoded Unicode data,
	190	and use the transliterated data as input for the transcoding to
0f5d89e8	191	the destination encoding.
b75a7d8f A	192	.TP
	193	.BI "\-\-to\-callback" " callback"
	194	Use
	195	.I callback
	196	to handle characters that cannot be transcoded to the destination
	197	encoding. See section
	198	.B CALLBACKS
	199	for details on valid callbacks.
	200	.TP
	201	.B "\-c"
	202	Omit invalid characters from the output.
	203	Same as
	204	.BR "\-\-to\-callback skip" .
	205	.TP
	206	.BI "\-\-from\-callback" " callback"
	207	Use
	208	.I callback
	209	to handle characters that cannot be transcoded from the original
	210	encoding. See section
	211	.B CALLBACKS
	212	for details on valid callbacks.
	213	.TP
	214	.B "\-i"
	215	Ignore invalid sequences in the input.
	216	Same as
	217	.BR "\-\-from\-callback skip" .
	218	.TP
	219	.BI "\-\-callback" " callback"
	220	Use
	221	.I callback
	222	to handle both characters that cannot be transcoded from the original
	223	encoding and characters that cannot be transcoded to the destination
	224	encoding. See section
	225	.B CALLBACKS
	226	for details on valid callbacks.
	227	.TP
	228	.BI "\-\-fallback"
	229	Use the fallback mapping when transcoding from
	230	Unicode to the destination encoding.
	231	.TP
	232	.BI "\-\-no\-fallback"
	233	Do not use the fallback mapping when transcoding from Unicode to the
	234	destination encoding.
	235	This is the default.
	236	.TP
	237	.BI "\-b\fP, \fB\-\-block\-size" " size"
	238	Read input in blocks of
	239	.I size
	240	bytes at a time. The default block size is
	241	4096.
	242	.TP
	243	.BI "\-f\fP, \fB\-\-from\-code" " encoding"
	244	Set the original encoding of the data to
	245	.IR encoding .
	246	.TP
	247	.BI "\-t\fP, \fB\-\-to\-code" " encoding"
	248	Transcode the data to
	249	.IR encoding .
	250	.TP
374ca955 A	251	.BI "\-\-add\-signature"
	252	Add a U+FEFF Unicode signature character (BOM) if the output charset
	253	supports it and does not add one anyway.
	254	.TP
	255	.BI "\-\-remove\-signature"
	256	Remove a U+FEFF Unicode signature character (BOM).
	257	.TP
b75a7d8f A	258	.BI "\-o\fP, \fB\-\-output" " file"
	259	Write the transcoded data to
	260	.IR file .
	261	.SH CALLBACKS
	262	.B uconv
	263	supports specifying callbacks to handle invalid data. Callbacks can be
	264	set for both directions of transcoding: from the original encoding to
	265	Unicode, with the
	266	.BR "\-\-from\-callback"
	267	option, and from Unicode to the destination encoding, with the
	268	.BR "\-\-to\-callback"
	269	option.
	270	.PP
	271	The following is a list of valid
	272	.I callback
729e4ab9	273	names, along with a description of their behavior. The list of
b75a7d8f A	274	callbacks actually supported by
	275	.B uconv
	276	is displayed when it is called with
	277	.BR "\-h\fP, \fB\-\-help" .
	278	.PP
	279	.TP \w'\fBescape-unicode'u+3n
	280	.B substitute
0f5d89e8	281	Write the encoding's substitute sequence, or the Unicode
b75a7d8f A	282	replacement character
	283	.B U+FFFD
	284	when transcoding to Unicode.
	285	.TP
	286	.B skip
	287	Ignore the invalid data.
	288	.TP
	289	.B stop
	290	Stop with an error when encountering invalid data.
	291	This is the default callback.
	292	.TP
	293	.B escape
	294	Same as
	295	.BR escape-icu .
	296	.TP
	297	.B escape-icu
	298	Replace the missing characters with a string of the format
	299	.BR %U\fIhhhh\fP
	300	for plane 0 characters, and
	301	.BR %U\fIhhhh\fP%U\fIhhhh\fP
	302	for planes 1 and above characters,
	303	where
	304	.I hhhh
	305	is the hexadecimal value of one of the UTF-16 code units representing the
	306	character. Characters from planes 1 and above are written as a pair of
	307	UTF-16 surrogate code units.
	308	.TP
	309	.B escape-java
	310	Replace the missing characters with a string of the format
	311	.BR \eu\fIhhhh\fP
	312	for plane 0 characters, and
	313	.BR \eu\fIhhhh\fP\eu\fIhhhh\fP
	314	for planes 1 and above characters,
	315	where
	316	.I hhhh
	317	is the hexadecimal value of one of the UTF-16 code units representing the
	318	character. Characters from planes 1 and above are written as a pair of
	319	UTF-16 surrogate code units.
	320	.TP
	321	.B escape-c
	322	Replace the missing characters with a string of the format
	323	.BR \eu\fIhhhh\fP
	324	for plane 0 characters, and
	325	.BR \eU\fIhhhhhhhh\fP
	326	for planes 1 and above characters,
	327	where
	328	.I hhhh
	329	and
	330	.I hhhhhhhh
	331	are the hexadecimal values of the Unicode codepoint.
	332	.TP
	333	.B escape-xml
	334	Same as
	335	.BR escape-xml-hex .
	336	.TP
	337	.B escape-xml-hex
	338	Replace the missing characters with a string of the format
	339	.BR &#x\fIhhhh\fP; ,
	340	where
	341	.I hhhh
	342	is the hexadecimal value of the Unicode codepoint.
	343	.TP
	344	.B escape-xml-dec
	345	Replace the missing characters with a string of the format
57a6839d	346	.BR &#\fInnnn\fP; ,
b75a7d8f A	347	where
	348	.I nnnn
	349	is the decimal value of the Unicode codepoint.
	350	.TP
	351	.B escape-unicode
	352	Replace the missing characters with a string of the format
	353	.BR {U+\fIhhhh\fP} ,
	354	where
	355	.I hhhh
	356	is the hexadecimal value of the Unicode codepoint.
	357	That hexadecimal string is of variable length and can use from 4 to
	358	6 digits.
	359	This is the format universally used to denote a Unicode codepoint in
0f5d89e8	360	the literature, delimited by curly braces for easy recognition of those
b75a7d8f A	361	substitutions in the output.
	362	.SH EXAMPLES
	363	Convert data from a given
	364	.I encoding
	365	to the platform encoding:
	366
	367	.RS 4
	368	.B \fR$ \fPuconv \-f \fIencoding\fP
	369	.RE
	370	.PP
	371	Check if a
	372	.I file
	373	contains valid data for a given
	374	.IR encoding :
	375
	376	.RS 4
	377	.B \fR$ \fPuconv \-f \fIencoding\fP \-c \fIfile\fP >/dev/null
	378	.RE
	379	.PP
	380	Convert a UTF-8
	381	.I file
	382	to a given
	383	.I encoding
	384	and ensure that the resulting text is good for any version of HTML:
	385
	386	.RS 4
	387	.B \fR$ \fPuconv \-f utf-8 \-t \fIencoding\fP \e
	388	.br
	389	.B " \-\-callback escape-xml-dec \fIfile\fP"
	390	.RE
	391	.PP
	392	Display the names of the Unicode code points in a UTF-8 file:
	393
	394	.RS 4
	395	.B \fR$ \fPuconv \-f utf-8 \-x any-name \fIfile\fP
	396	.RE
	397	.PP
	398	Print the name of a Unicode code point whose value is known (\fBU+30AB\fP
	399	in this example):
	400
	401	.RS 4
	402	.B \fR$ \fPecho '\eu30ab' | uconv \-x 'hex-any; any-name'; echo
	403	.br
	404	{KATAKANA LETTER KA}{LINE FEED}
	405	.br
	406	$
	407	.RE
	408
	409	(The names are delimited by curly braces.
	410	Note that the name of the line terminator is displayed as well.)
	411	.PP
	412	Normalize UTF-8 data using Unicode NFKC, remove all control characters,
	413	and map Katakana to Hiragana:
	414
	415	.RS 4
	416	.B \fR$ \fPuconv \-f utf-8 \-t utf-8 \e
	417	.br
	418	.B " \-x '::nfkc; [:Cc:] >; ::katakana-hiragana;'"
	419	.SH CAVEATS AND BUGS
	420	.B uconv
0f5d89e8	421	reports errors as occurring at the first invalid byte
b75a7d8f A	422	encountered. This may be confusing to users of GNU
b75a7d8f A	423	.BR iconv (1),
0f5d89e8	424	which reports errors as occurring at the first byte of an invalid
b75a7d8f A	425	sequence. For multi-byte character sets or encodings, this means that
	426	.BR uconv
	427	error positions may be at a later offset in the input stream than
	428	would be the case with GNU
	429	.BR iconv (1).
	430	.PP
	431	The reporting of error positions when a transliterator is used may be
	432	inaccurate or unavailable, in which case
	433	.BR uconv
	434	will report the offset in the output stream at which the error
0f5d89e8	435	occurred.
b75a7d8f A	436	.SH AUTHORS
	437	Jonas Utterstroem
	438	.br
	439	Yves Arrouye
	440	.SH VERSION
	441	@VERSION@
	442	.SH COPYRIGHT
73c04bcf	443	Copyright (C) 2000-2005 IBM, Inc. and others.
b75a7d8f	444	.SH SEE ALSO
b75a7d8f	445	.BR iconv (1)