]> git.saurik.com Git - wxWidgets.git/blob - src/common/encconv.cpp
wxHtmlFilterHTML adds fake <meta> tag so that the parser knows charset
[wxWidgets.git] / src / common / encconv.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: encconv.cpp
3 // Purpose: wxEncodingConverter class for converting between different
4 // font encodings
5 // Author: Vaclav Slavik
6 // Copyright: (c) 1999 Vaclav Slavik
7 // Licence: wxWindows Licence
8 /////////////////////////////////////////////////////////////////////////////
9
10 #ifdef __GNUG__
11 #pragma implementation "encconv.h"
12 #endif
13
14 // For compilers that support precompilation, includes "wx.h".
15 #include "wx/wxprec.h"
16
17 #ifdef __BORLANDC__
18 #pragma hdrstop
19 #endif
20
21 #include "wx/encconv.h"
22
23 #include <stdlib.h>
24
25 // conversion tables, generated by scripts in $(WXWIN)/misc/unictabl:
26 #ifdef __BORLANDC__
27 #include "../common/unictabl.inc"
28 #else
29 #include "unictabl.inc"
30 #endif
31
32
33 static wxUint16* LINKAGEMODE GetEncTable(wxFontEncoding enc)
34 {
35 for (int i = 0; encodings_list[i].table != NULL; i++)
36 {
37 if (encodings_list[i].encoding == enc)
38 return encodings_list[i].table;
39 }
40 return NULL;
41 }
42
43 typedef struct {
44 wxUint16 u;
45 wxUint8 c;
46 } CharsetItem;
47
48
49
50 static int LINKAGEMODE CompareCharsetItems(const void *i1, const void *i2)
51 {
52 return ( ((CharsetItem*)i1) -> u - ((CharsetItem*)i2) -> u );
53 }
54
55
56 static CharsetItem* LINKAGEMODE BuildReverseTable(wxUint16 *tbl)
57 {
58 CharsetItem *rev = new CharsetItem[128];
59
60 for (int i = 0; i < 128; i++)
61 rev[i].c = 128 + i, rev[i].u = tbl[i];
62
63 qsort(rev, 128, sizeof(CharsetItem), CompareCharsetItems);
64
65 return rev;
66 }
67
68
69
70 wxEncodingConverter::wxEncodingConverter()
71 {
72 m_Table = NULL;
73 m_UnicodeInput = m_UnicodeOutput = FALSE;
74 m_JustCopy = FALSE;
75 }
76
77
78
79 bool wxEncodingConverter::Init(wxFontEncoding input_enc, wxFontEncoding output_enc, int method)
80 {
81 unsigned i;
82 wxUint16 *in_tbl = NULL, *out_tbl = NULL;
83
84 if (m_Table) {delete[] m_Table; m_Table = NULL;}
85
86 #if !wxUSE_UNICODE
87 if (input_enc == wxFONTENCODING_UNICODE || output_enc == wxFONTENCODING_UNICODE) return FALSE;
88 #endif
89
90 if (input_enc == output_enc) {m_JustCopy = TRUE; return TRUE;}
91
92 m_UnicodeOutput = (output_enc == wxFONTENCODING_UNICODE);
93 m_JustCopy = FALSE;
94
95 if (input_enc == wxFONTENCODING_UNICODE)
96 {
97 if ((out_tbl = GetEncTable(output_enc)) == NULL) return FALSE;
98
99 m_Table = new wxChar[65536];
100 for (i = 0; i < 128; i++) m_Table[i] = (wxChar)i; // 7bit ASCII
101 for (i = 128; i < 65536; i++) m_Table[i] = (wxChar)'?';
102 // FIXME - this should be character that means `unicode to charset' impossible, not '?'
103
104 if (method == wxCONVERT_SUBSTITUTE)
105 {
106 for (i = 0; i < encoding_unicode_fallback_count; i++)
107 m_Table[encoding_unicode_fallback[i].c] = (wxChar) encoding_unicode_fallback[i].s;
108 }
109
110 for (i = 0; i < 128; i++)
111 m_Table[out_tbl[i]] = (wxChar)(128 + i);
112
113 m_UnicodeInput = TRUE;
114 return TRUE;
115 }
116
117 else
118 {
119 if ((in_tbl = GetEncTable(input_enc)) == NULL) return FALSE;
120 if (output_enc != wxFONTENCODING_UNICODE)
121 if ((out_tbl = GetEncTable(output_enc)) == NULL) return FALSE;
122
123 m_UnicodeInput = FALSE;
124
125 m_Table = new wxChar[256];
126 for (i = 0; i < 128; i++) m_Table[i] = (wxChar)i; // 7bit ASCII
127
128 if (output_enc == wxFONTENCODING_UNICODE)
129 {
130 for (i = 0; i < 128; i++) m_Table[128 + i] = (wxChar)in_tbl[i]; // wxChar is 2byte now
131 return TRUE;
132 }
133 else
134 {
135 CharsetItem *rev = BuildReverseTable(out_tbl);
136 CharsetItem *item, key;
137
138 for (i = 0; i < 128; i++)
139 {
140 key.u = in_tbl[i];
141 item = (CharsetItem*) bsearch(&key, rev, 128, sizeof(CharsetItem), CompareCharsetItems);
142 if (item == NULL && method == wxCONVERT_SUBSTITUTE)
143 item = (CharsetItem*) bsearch(&key, encoding_unicode_fallback,
144 encoding_unicode_fallback_count, sizeof(CharsetItem), CompareCharsetItems);
145 if (item)
146 m_Table[128 + i] = (wxChar)item -> c;
147 else
148 m_Table[128 + i] = 128 + i; // don't know => don't touch
149 }
150
151 delete[] rev;
152 return TRUE;
153 }
154 }
155 }
156
157
158
159 void wxEncodingConverter::Convert(const wxChar* input, wxChar* output)
160 {
161 if (m_JustCopy)
162 {
163 wxStrcpy(output, input);
164 return;
165 }
166
167 wxASSERT_MSG(m_Table != NULL, wxT("You must call wxEncodingConverter::Init() before actually converting!"));
168
169 const wxChar *i;
170 wxChar *o;
171
172 if (m_UnicodeInput)
173 for (i = input, o = output; *i != 0; i++, o++)
174 *o = (wxChar)(m_Table[(wxUint16)*i]);
175 else
176 for (i = input, o = output; *i != 0; i++, o++)
177 *o = (wxChar)(m_Table[(wxUint8)*i]);
178 *o = 0;
179 }
180
181
182 #if wxUSE_UNICODE // otherwise wxChar === char
183
184 void wxEncodingConverter::Convert(const char* input, wxChar* output)
185 {
186 wxASSERT_MSG(!m_UnicodeInput, wxT("You cannot convert from unicode if input is const char*!"));
187
188 const char *i;
189 wxChar *o;
190
191 if (m_JustCopy)
192 {
193 for (i = input, o = output; *i != 0;)
194 *(o++) = (wxChar)(*(i++));
195 *o = 0;
196 return;
197 }
198
199 wxASSERT_MSG(m_Table != NULL, wxT("You must call wxEncodingConverter::Init() before actually converting!"));
200
201 for (i = input, o = output; *i != 0;)
202 *(o++) = (wxChar)(m_Table[(wxUint8)*(i++)]);
203 *o = 0;
204 }
205
206
207
208 void wxEncodingConverter::Convert(const wxChar* input, char* output)
209 {
210 wxASSERT_MSG(!m_UnicodeOutput, wxT("You cannot convert to unicode if output is const char*!"));
211
212 const wxChar *i;
213 char *o;
214
215 if (m_JustCopy)
216 {
217 for (i = input, o = output; *i != 0;)
218 *(o++) = (char)(*(i++));
219 *o = 0;
220 return;
221 }
222
223 wxASSERT_MSG(m_Table != NULL, wxT("You must call wxEncodingConverter::Init() before actually converting!"));
224
225 if (m_UnicodeInput)
226 for (i = input, o = output; *i != 0; i++, o++)
227 *o = (char)(m_Table[(wxUint16)*i]);
228 else
229 for (i = input, o = output; *i != 0; i++, o++)
230 *o = (char)(m_Table[(wxUint8)*i]);
231 *o = 0;
232 }
233
234
235
236 void wxEncodingConverter::Convert(const char* input, char* output)
237 {
238 wxASSERT_MSG(!m_UnicodeOutput, wxT("You cannot convert to unicode if output is const char*!"));
239 wxASSERT_MSG(!m_UnicodeInput, wxT("You cannot convert from unicode if input is const char*!"));
240
241 const char *i;
242 char *o;
243
244 if (m_JustCopy)
245 {
246 strcpy(output, input);
247 return;
248 }
249
250 wxASSERT_MSG(m_Table != NULL, wxT("You must call wxEncodingConverter::Init() before actually converting!"));
251
252 for (i = input, o = output; *i != 0;)
253 *(o++) = (char)(m_Table[(wxUint8)*(i++)]);
254 *o = 0;
255 }
256
257 #endif // wxUSE_UNICODE
258
259
260 wxString wxEncodingConverter::Convert(const wxString& input)
261 {
262 if (m_JustCopy) return input;
263
264 wxString s;
265 const wxChar *i;
266
267 wxASSERT_MSG(m_Table != NULL, wxT("You must call wxEncodingConverter::Init() before actually converting!"));
268
269 if (m_UnicodeInput)
270 for (i = input.c_str(); *i != 0; i++)
271 s << (wxChar)(m_Table[(wxUint16)*i]);
272 else
273 for (i = input.c_str(); *i != 0; i++)
274 s << (wxChar)(m_Table[(wxUint8)*i]);
275 return s;
276 }
277
278
279
280
281
282
283
284 // Following tables describe classes of encoding equivalence.
285 //
286
287 #define STOP wxFONTENCODING_SYSTEM
288
289 #define NUM_OF_PLATFORMS 4 /*must conform to enum wxPLATFORM_XXXX !!!*/
290 #define ENC_PER_PLATFORM 3
291 // max no. of encodings for one language used on one platform
292 // Anybody thinks 3 is not enough? ;-)
293
294 static wxFontEncoding
295 EquivalentEncodings[][NUM_OF_PLATFORMS][ENC_PER_PLATFORM+1] = {
296
297 // *** Please put more common encodings as first! ***
298
299 // West European
300 {
301 /* unix */ {wxFONTENCODING_ISO8859_1, wxFONTENCODING_ISO8859_15, STOP},
302 /* windows */ {wxFONTENCODING_CP1252, STOP},
303 /* os2 */ {STOP},
304 /* mac */ {STOP}
305 },
306
307 // Central European
308 {
309 /* unix */ {wxFONTENCODING_ISO8859_2, STOP},
310 /* windows */ {wxFONTENCODING_CP1250, STOP},
311 /* os2 */ {STOP},
312 /* mac */ {STOP}
313 },
314
315 // Baltic
316 {
317 /* unix */ {wxFONTENCODING_ISO8859_13, STOP},
318 /* windows */ {wxFONTENCODING_CP1257, STOP},
319 /* os2 */ {STOP},
320 /* mac */ {STOP}
321 },
322
323 // Hebrew
324 {
325 /* unix */ {wxFONTENCODING_ISO8859_8, STOP},
326 /* windows */ {wxFONTENCODING_CP1255, STOP},
327 /* os2 */ {STOP},
328 /* mac */ {STOP}
329 },
330
331 // Greek
332 {
333 /* unix */ {wxFONTENCODING_ISO8859_7, STOP},
334 /* windows */ {wxFONTENCODING_CP1253, STOP},
335 /* os2 */ {STOP},
336 /* mac */ {STOP}
337 },
338
339 // Arabic
340 {
341 /* unix */ {wxFONTENCODING_ISO8859_6, STOP},
342 /* windows */ {wxFONTENCODING_CP1256, STOP},
343 /* os2 */ {STOP},
344 /* mac */ {STOP}
345 },
346
347 // Turkish
348 {
349 /* unix */ {wxFONTENCODING_ISO8859_9, STOP},
350 /* windows */ {wxFONTENCODING_CP1254, STOP},
351 /* os2 */ {STOP},
352 /* mac */ {STOP}
353 },
354
355 // Cyrillic
356 {
357 /* unix */ {wxFONTENCODING_ISO8859_13, wxFONTENCODING_ISO8859_4, STOP},
358 /* windows */ {wxFONTENCODING_CP1257, STOP},
359 /* os2 */ {STOP},
360 /* mac */ {STOP}
361 },
362
363 // Russia and other KOI-8 users:
364 {
365 /* unix */ {wxFONTENCODING_KOI8, wxFONTENCODING_ISO8859_5, STOP},
366 /* windows */ {wxFONTENCODING_CP1251, STOP},
367 /* os2 */ {STOP},
368 /* mac */ {STOP}
369 },
370
371 {{STOP},{STOP},{STOP},{STOP}} /* Terminator */
372 /* no, _not_ Arnold! */
373 };
374
375
376
377
378 wxFontEncodingArray wxEncodingConverter::GetPlatformEquivalents(wxFontEncoding enc, int platform)
379 {
380 if (platform == wxPLATFORM_CURRENT)
381 {
382 #if defined(__WXMSW__)
383 platform = wxPLATFORM_WINDOWS;
384 #elif defined(__WXGTK__) || defined(__WXMOTIF__)
385 platform = wxPLATFORM_UNIX;
386 #elif defined(__WXOS2__)
387 platform = wxPLATFORM_OS2;
388 #elif defined(__WXMAC__)
389 platform = wxPLATFORM_MAC;
390 #endif
391 }
392
393 int i, clas, e ;
394 wxFontEncoding *f;
395 wxFontEncodingArray arr;
396
397 clas = 0;
398 while (EquivalentEncodings[clas][0][0] != STOP)
399 {
400 for (i = 0; i < NUM_OF_PLATFORMS; i++)
401 for (e = 0; EquivalentEncodings[clas][i][e] != STOP; e++)
402 if (EquivalentEncodings[clas][i][e] == enc)
403 {
404 for (f = EquivalentEncodings[clas][platform]; *f != STOP; f++)
405 if (*f == enc) arr.Add(enc);
406 for (f = EquivalentEncodings[clas][platform]; *f != STOP; f++)
407 if (arr.Index(*f) == wxNOT_FOUND) arr.Add(*f);
408 i = NUM_OF_PLATFORMS/*hack*/; break;
409 }
410 clas++;
411 }
412
413 return arr;
414 }
415
416
417
418 wxFontEncodingArray wxEncodingConverter::GetAllEquivalents(wxFontEncoding enc)
419 {
420 int i, clas, e, j ;
421 wxFontEncoding *f;
422 wxFontEncodingArray arr;
423
424 arr = GetPlatformEquivalents(enc); // we want them to be first items in array
425
426 clas = 0;
427 while (EquivalentEncodings[clas][0][0] != STOP)
428 {
429 for (i = 0; i < NUM_OF_PLATFORMS; i++)
430 for (e = 0; EquivalentEncodings[clas][i][e] != STOP; e++)
431 if (EquivalentEncodings[clas][i][e] == enc)
432 {
433 for (j = 0; j < NUM_OF_PLATFORMS; j++)
434 for (f = EquivalentEncodings[clas][j]; *f != STOP; f++)
435 if (arr.Index(*f) == wxNOT_FOUND) arr.Add(*f);
436 i = NUM_OF_PLATFORMS/*hack*/; break;
437 }
438 clas++;
439 }
440
441 return arr;
442 }