]>
Commit | Line | Data |
---|---|---|
b75a7d8f A |
1 | .\" Hey, Emacs! This is -*-nroff-*- you know... |
2 | .\" | |
3 | .\" uconv.1: manual page for the uconv utility. | |
4 | .\" | |
73c04bcf | 5 | .\" Copyright (C) 2000-2005 IBM, Inc. and others. |
b75a7d8f A |
6 | .\" |
7 | .\" Manual page by Yves Arrouye <yves@realnames.com>. | |
8 | .\" | |
73c04bcf | 9 | .TH UCONV 1 "2005-jul-1" "ICU MANPAGE" "ICU @VERSION@ Manual" |
b75a7d8f A |
10 | .SH NAME |
11 | .B uconv | |
12 | \- convert data from one encoding to another | |
13 | .SH SYNOPSIS | |
14 | .B uconv | |
15 | [ | |
16 | .BR "\-h\fP, \fB\-?\fP, \fB\-\-help" | |
17 | ] | |
18 | [ | |
19 | .BI "\-V\fP, \fB\-\-version" | |
20 | ] | |
21 | [ | |
22 | .BI "\-s\fP, \fB\-\-silent" | |
23 | ] | |
24 | [ | |
25 | .BI "\-v\fP, \fB\-\-verbose" | |
26 | ] | |
27 | [ | |
28 | .BI "\-l\fP, \fB\-\-list" | |
29 | | | |
30 | .BI "\-l\fP, \fB\-\-list\-code" " code" | |
31 | | | |
32 | .BI "\-\-default-code" | |
33 | | | |
34 | .BI "\-L\fP, \fB\-\-list\-transliterators" | |
35 | ] | |
36 | [ | |
37 | .BI "\-\-canon" | |
38 | ] | |
39 | [ | |
40 | .BI "\-x" " transliteration" | |
41 | ] | |
42 | [ | |
43 | .BI "\-\-to\-callback" " callback" | |
44 | | | |
45 | .B "\-c" | |
46 | ] | |
47 | [ | |
48 | .BI "\-\-from\-callback" " callback" | |
49 | | | |
50 | .B "\-i" | |
51 | ] | |
52 | [ | |
53 | .BI "\-\-callback" " callback" | |
54 | ] | |
55 | [ | |
56 | .BI "\-\-fallback" | |
57 | | | |
58 | .BI "\-\-no\-fallback" | |
59 | ] | |
60 | [ | |
61 | .BI "\-b\fP, \fB\-\-block\-size" " size" | |
62 | ] | |
63 | [ | |
64 | .BI "\-f\fP, \fB\-\-from\-code" " encoding" | |
65 | ] | |
66 | [ | |
67 | .BI "\-t\fP, \fB\-\-to\-code" " encoding" | |
68 | ] | |
69 | [ | |
374ca955 A |
70 | .BI "\-\-add\-signature" |
71 | ] | |
72 | [ | |
73 | .BI "\-\-remove\-signature" | |
74 | ] | |
75 | [ | |
b75a7d8f A |
76 | .BI "\-o\fP, \fB\-\-output" " file" |
77 | ] | |
78 | [ | |
79 | .IR file .\|.\|. | |
80 | ] | |
81 | .SH DESCRIPTION | |
82 | .B uconv | |
83 | converts, or transcodes, each given | |
84 | .I file | |
85 | (or its standard input if no | |
86 | .I file | |
87 | is specified) from one | |
88 | .I encoding | |
89 | to another. | |
90 | The transcoding is done using Unicode as a pivot encoding | |
91 | (i.e. the data are first transcoded from their original encoding to | |
92 | Unicode, and then from Unicode to the destination encoding). | |
93 | .PP | |
94 | If an | |
95 | .I encoding | |
96 | is not specified or is | |
97 | .BR - , | |
98 | the default encoding is used. Thus, calling | |
99 | .B uconv | |
100 | with no | |
101 | .I encoding | |
102 | provides an easy way to validate and sanitize data files for | |
103 | further consumption by tools requiring data in the default encoding. | |
104 | .PP | |
105 | When calling | |
106 | .BR uconv , | |
107 | it is possible to specify callbacks that are used to handle invalid | |
108 | characters in the input, or characters that cannot be transcoded to | |
109 | the destination encoding. Some encodings, for example, offer a default | |
110 | substitution character that can be used to represent the occurrence of | |
111 | such characters in the input. Other callbacks offer a useful visual | |
112 | representation of the invalid data. | |
113 | .PP | |
114 | .B uconv | |
115 | can also run the specified | |
116 | .IR transliteration | |
117 | on the transcoded data, | |
118 | in which case transliteration will happen as an intermediate step, | |
119 | after the data have been transcoded to Unicode. | |
120 | The | |
121 | .I transliteration | |
122 | can be either a list of semicolon-separated transliterator names, | |
374ca955 | 123 | or an arbitrarily complex set of rules in the ICU transliteration |
b75a7d8f A |
124 | rules format. |
125 | .PP | |
126 | For transcoding purposes, | |
127 | .B uconv | |
128 | options are compatible with those of | |
129 | .BR iconv (1), | |
374ca955 | 130 | making it easy to replace it in scripts. It is not necessarily the case, |
b75a7d8f A |
131 | however, that the encoding names used by |
132 | .B uconv | |
133 | and ICU are the same as the ones used by | |
134 | .BR iconv (1). | |
135 | Also, options that provide informational data, such as the | |
136 | .B \-l\fP, \fB\-\-list | |
137 | one offered by some | |
138 | .BR iconv (1) | |
139 | variants such as GNU's, produce data in a slightly different and | |
140 | easier to parse format. | |
141 | .SH OPTIONS | |
142 | .TP | |
143 | .BR "\-h\fP, \fB\-?\fP, \fB\-\-help" | |
144 | Print help about usage and exit. | |
145 | .TP | |
146 | .BR "\-V\fP, \fB\-\-version" | |
147 | Print the version of | |
148 | .B uconv | |
149 | and exit. | |
150 | .TP | |
151 | .BI "\-s\fP, \fB\-\-silent" | |
152 | Suppress messages during execution. | |
153 | .TP | |
154 | .BI "\-v\fP, \fB\-\-verbose" | |
155 | Display extra informative messages during execution. | |
156 | .TP | |
157 | .BI "\-l\fP, \fB\-\-list" | |
158 | List all the available encodings and exit. | |
159 | .TP | |
160 | .BI "\-l\fP, \fB\-\-list\-code" " code" | |
161 | List only the | |
162 | .I code | |
163 | encoding and exit. If | |
164 | .I code | |
165 | is not a proper encoding, exit with an error. | |
166 | .TP | |
167 | .BI "\-\-default-code" | |
168 | List only the name of the default encoding and exit. | |
169 | .TP | |
170 | .BI "\-L\fP, \fB\-\-list\-transliterators" | |
171 | List all the available transliterators and exit. | |
172 | .TP | |
173 | .BI "\-\-canon" | |
174 | If used with | |
175 | .BI "\-l\fP, \fB\-\-list" | |
176 | or | |
177 | .BR "\-\-default-code" , | |
178 | the list of encodings is produced in a format compatible with | |
179 | .BR convrtrs.txt (5). | |
180 | If used with | |
181 | .BR "\-L\fP, \fB\-\-list\-transliterators" , | |
182 | print only one transliterator name per line. | |
183 | .TP | |
184 | .BI "\-x" " transliteration" | |
185 | Run the given | |
186 | .IR transliteration | |
187 | on the transcoded Unicode data, | |
188 | and use the transliterated data as input for the transcoding to | |
189 | the destination encoding. | |
190 | .TP | |
191 | .BI "\-\-to\-callback" " callback" | |
192 | Use | |
193 | .I callback | |
194 | to handle characters that cannot be transcoded to the destination | |
195 | encoding. See section | |
196 | .B CALLBACKS | |
197 | for details on valid callbacks. | |
198 | .TP | |
199 | .B "\-c" | |
200 | Omit invalid characters from the output. | |
201 | Same as | |
202 | .BR "\-\-to\-callback skip" . | |
203 | .TP | |
204 | .BI "\-\-from\-callback" " callback" | |
205 | Use | |
206 | .I callback | |
207 | to handle characters that cannot be transcoded from the original | |
208 | encoding. See section | |
209 | .B CALLBACKS | |
210 | for details on valid callbacks. | |
211 | .TP | |
212 | .B "\-i" | |
213 | Ignore invalid sequences in the input. | |
214 | Same as | |
215 | .BR "\-\-from\-callback skip" . | |
216 | .TP | |
217 | .BI "\-\-callback" " callback" | |
218 | Use | |
219 | .I callback | |
220 | to handle both characters that cannot be transcoded from the original | |
221 | encoding and characters that cannot be transcoded to the destination | |
222 | encoding. See section | |
223 | .B CALLBACKS | |
224 | for details on valid callbacks. | |
225 | .TP | |
226 | .BI "\-\-fallback" | |
227 | Use the fallback mapping when transcoding from | |
228 | Unicode to the destination encoding. | |
229 | .TP | |
230 | .BI "\-\-no\-fallback" | |
231 | Do not use the fallback mapping when transcoding from Unicode to the | |
232 | destination encoding. | |
233 | This is the default. | |
234 | .TP | |
235 | .BI "\-b\fP, \fB\-\-block\-size" " size" | |
236 | Read input in blocks of | |
237 | .I size | |
238 | bytes at a time. The default block size is | |
239 | 4096. | |
240 | .TP | |
241 | .BI "\-f\fP, \fB\-\-from\-code" " encoding" | |
242 | Set the original encoding of the data to | |
243 | .IR encoding . | |
244 | .TP | |
245 | .BI "\-t\fP, \fB\-\-to\-code" " encoding" | |
246 | Transcode the data to | |
247 | .IR encoding . | |
248 | .TP | |
374ca955 A |
249 | .BI "\-\-add\-signature" |
250 | Add a U+FEFF Unicode signature character (BOM) if the output charset | |
251 | supports it and does not add one anyway. | |
252 | .TP | |
253 | .BI "\-\-remove\-signature" | |
254 | Remove a U+FEFF Unicode signature character (BOM). | |
255 | .TP | |
b75a7d8f A |
256 | .BI "\-o\fP, \fB\-\-output" " file" |
257 | Write the transcoded data to | |
258 | .IR file . | |
259 | .SH CALLBACKS | |
260 | .B uconv | |
261 | supports specifying callbacks to handle invalid data. Callbacks can be | |
262 | set for both directions of transcoding: from the original encoding to | |
263 | Unicode, with the | |
264 | .BR "\-\-from\-callback" | |
265 | option, and from Unicode to the destination encoding, with the | |
266 | .BR "\-\-to\-callback" | |
267 | option. | |
268 | .PP | |
269 | The following is a list of valid | |
270 | .I callback | |
271 | names, along with a description of their behavior. The list of | |
272 | callbacks actually supported by | |
273 | .B uconv | |
274 | is displayed when it is called with | |
275 | .BR "\-h\fP, \fB\-\-help" . | |
276 | .PP | |
277 | .TP \w'\fBescape-unicode'u+3n | |
278 | .B substitute | |
279 | Write the encoding's substitute sequence, or the Unicode | |
280 | replacement character | |
281 | .B U+FFFD | |
282 | when transcoding to Unicode. | |
283 | .TP | |
284 | .B skip | |
285 | Ignore the invalid data. | |
286 | .TP | |
287 | .B stop | |
288 | Stop with an error when encountering invalid data. | |
289 | This is the default callback. | |
290 | .TP | |
291 | .B escape | |
292 | Same as | |
293 | .BR escape-icu . | |
294 | .TP | |
295 | .B escape-icu | |
296 | Replace the missing characters with a string of the format | |
297 | .BR %U\fIhhhh\fP | |
298 | for plane 0 characters, and | |
299 | .BR %U\fIhhhh\fP%U\fIhhhh\fP | |
300 | for planes 1 and above characters, | |
301 | where | |
302 | .I hhhh | |
303 | is the hexadecimal value of one of the UTF-16 code units representing the | |
304 | character. Characters from planes 1 and above are written as a pair of | |
305 | UTF-16 surrogate code units. | |
306 | .TP | |
307 | .B escape-java | |
308 | Replace the missing characters with a string of the format | |
309 | .BR \eu\fIhhhh\fP | |
310 | for plane 0 characters, and | |
311 | .BR \eu\fIhhhh\fP\eu\fIhhhh\fP | |
312 | for planes 1 and above characters, | |
313 | where | |
314 | .I hhhh | |
315 | is the hexadecimal value of one of the UTF-16 code units representing the | |
316 | character. Characters from planes 1 and above are written as a pair of | |
317 | UTF-16 surrogate code units. | |
318 | .TP | |
319 | .B escape-c | |
320 | Replace the missing characters with a string of the format | |
321 | .BR \eu\fIhhhh\fP | |
322 | for plane 0 characters, and | |
323 | .BR \eU\fIhhhhhhhh\fP | |
324 | for planes 1 and above characters, | |
325 | where | |
326 | .I hhhh | |
327 | and | |
328 | .I hhhhhhhh | |
329 | are the hexadecimal values of the Unicode codepoint. | |
330 | .TP | |
331 | .B escape-xml | |
332 | Same as | |
333 | .BR escape-xml-hex . | |
334 | .TP | |
335 | .B escape-xml-hex | |
336 | Replace the missing characters with a string of the format | |
337 | .BR &#x\fIhhhh\fP; , | |
338 | where | |
339 | .I hhhh | |
340 | is the hexadecimal value of the Unicode codepoint. | |
341 | .TP | |
342 | .B escape-xml-dec | |
343 | Replace the missing characters with a string of the format | |
344 | .BR &#\fInnnn\fP; , | |
345 | where | |
346 | .I nnnn | |
347 | is the decimal value of the Unicode codepoint. | |
348 | .TP | |
349 | .B escape-unicode | |
350 | Replace the missing characters with a string of the format | |
351 | .BR {U+\fIhhhh\fP} , | |
352 | where | |
353 | .I hhhh | |
354 | is the hexadecimal value of the Unicode codepoint. | |
355 | That hexadecimal string is of variable length and can use from 4 to | |
356 | 6 digits. | |
357 | This is the format universally used to denote a Unicode codepoint in | |
358 | the literature, delimited by curly braces for easy recognition of those | |
359 | substitutions in the output. | |
360 | .SH EXAMPLES | |
361 | Convert data from a given | |
362 | .I encoding | |
363 | to the platform encoding: | |
364 | ||
365 | .RS 4 | |
366 | .B \fR$ \fPuconv \-f \fIencoding\fP | |
367 | .RE | |
368 | .PP | |
369 | Check if a | |
370 | .I file | |
371 | contains valid data for a given | |
372 | .IR encoding : | |
373 | ||
374 | .RS 4 | |
375 | .B \fR$ \fPuconv \-f \fIencoding\fP \-c \fIfile\fP >/dev/null | |
376 | .RE | |
377 | .PP | |
378 | Convert a UTF-8 | |
379 | .I file | |
380 | to a given | |
381 | .I encoding | |
382 | and ensure that the resulting text is good for any version of HTML: | |
383 | ||
384 | .RS 4 | |
385 | .B \fR$ \fPuconv \-f utf-8 \-t \fIencoding\fP \e | |
386 | .br | |
387 | .B " \-\-callback escape-xml-dec \fIfile\fP" | |
388 | .RE | |
389 | .PP | |
390 | Display the names of the Unicode code points in a UTF-8 file: | |
391 | ||
392 | .RS 4 | |
393 | .B \fR$ \fPuconv \-f utf-8 \-x any-name \fIfile\fP | |
394 | .RE | |
395 | .PP | |
396 | Print the name of a Unicode code point whose value is known (\fBU+30AB\fP | |
397 | in this example): | |
398 | ||
399 | .RS 4 | |
400 | .B \fR$ \fPecho '\eu30ab' | uconv \-x 'hex-any; any-name'; echo | |
401 | .br | |
402 | {KATAKANA LETTER KA}{LINE FEED} | |
403 | .br | |
404 | $ | |
405 | .RE | |
406 | ||
407 | (The names are delimited by curly braces. | |
408 | The name of the line terminator is also displayed.) | |
409 | .PP | |
410 | Normalize UTF-8 data using Unicode NFKC, remove all control characters, | |
411 | and map Katakana to Hiragana: | |
412 | ||
413 | .RS 4 | |
414 | .B \fR$ \fPuconv \-f utf-8 \-t utf-8 \e | |
415 | .br | |
416 | .B " \-x '::nfkc; [:Cc:] >; ::katakana-hiragana;'" | |
417 | .SH CAVEATS AND BUGS | |
418 | .B uconv | |
419 | reports errors as occurring at the first invalid byte | |
420 | encountered. This may be confusing to users of GNU | |
421 | .BR iconv (1), | |
422 | which reports errors as occurring at the first byte of an invalid | |
423 | sequence. For multi-byte character sets or encodings, this means that | |
424 | .BR uconv | |
425 | error positions may be at a later offset in the input stream than | |
426 | would be the case with GNU | |
427 | .BR iconv (1). | |
428 | .PP | |
429 | The reporting of error positions when a transliterator is used may be | |
430 | inaccurate or unavailable, in which case | |
431 | .BR uconv | |
432 | will report the offset in the output stream at which the error | |
433 | occurred. | |
73c04bcf A |
434 | .\" .SH FILES |
435 | .\" .TP 15 | |
436 | .\" .B @pkgicudatadir@/@PACKAGE@/@VERSION@/uconvmsg.dat | |
437 | .\" Compiled resource bundle containing localized messages printed | |
438 | .\" by | |
439 | .\" .BR uconv . | |
b75a7d8f A |
440 | .SH AUTHORS |
441 | Jonas Utterstroem | |
442 | .br | |
443 | Yves Arrouye | |
444 | .SH VERSION | |
445 | @VERSION@ | |
446 | .SH COPYRIGHT | |
73c04bcf | 447 | Copyright (C) 2000-2005 IBM, Inc. and others. |
b75a7d8f | 448 | .SH SEE ALSO |
b75a7d8f | 449 | .BR iconv (1) |