X-Git-Url: https://git.saurik.com/wxWidgets.git/blobdiff_plain/ea8ce907e19d774fab4661eef478e32a0b2fead3..088ddc4e370d808a2f7e923b671856c303dfa45c:/src/common/strconv.cpp diff --git a/src/common/strconv.cpp b/src/common/strconv.cpp index 2603ebec34..88e49338d7 100644 --- a/src/common/strconv.cpp +++ b/src/common/strconv.cpp @@ -55,9 +55,6 @@ #include #include #include -#ifdef HAVE_LANGINFO_H - #include -#endif #if defined(__WIN32__) && !defined(__WXMICROWIN__) #define wxHAVE_WIN32_MB2WC @@ -155,7 +152,7 @@ static size_t decode_utf16(const wxUint16* input, wxUint32& output) output = *input; return 1; } - else if ((input[1]<0xdc00) || (input[1]>=0xdfff)) + else if ((input[1]<0xdc00) || (input[1]>0xdfff)) { output = *input; return (size_t)-1; @@ -281,7 +278,7 @@ const wxWCharBuffer wxMBConv::cMB2WC(const char *szString, size_t nStringLen, si //success - return actual length and the buffer *pOutSize = nActualLength; - return theBuffer; + return theBuffer; } const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *szString, size_t nStringLen, size_t* pOutSize) const @@ -316,7 +313,7 @@ const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *szString, size_t nStringLen, //Increase the actual length (+1 for current null character) nActualLength += nLen + 1; - + //if buffer too big, realloc the buffer if (nActualLength > (nCurrentSize+1)) { @@ -343,7 +340,7 @@ const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *szString, size_t nStringLen, //success - return actual length and the buffer *pOutSize = nActualLength; - return theBuffer; + return theBuffer; } // ---------------------------------------------------------------------------- @@ -360,52 +357,41 @@ size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const return wxWC2MB(buf, psz, n); } +#ifdef __UNIX__ + // ---------------------------------------------------------------------------- -// wxConvBrokenFileNames is made for GTK2 in Unicode mode when -// files are accidentally written in an encoding which is not -// 
the system encoding. Typically, the system encoding will be -// UTF8 but there might be files stored in ISO8859-1 on disk. +// wxConvBrokenFileNames // ---------------------------------------------------------------------------- -class wxConvBrokenFileNames: public wxMBConvLibc -{ -public: - wxConvBrokenFileNames() : m_utf8conv(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL) { } - virtual size_t MB2WC(wchar_t *outputBuf, const char *psz, size_t outputSize) const; - virtual size_t WC2MB(char *outputBuf, const wchar_t *psz, size_t outputSize) const; - inline bool UseUTF8() const; -private: - wxMBConvUTF8 m_utf8conv; -}; - -bool wxConvBrokenFileNames::UseUTF8() const +wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar *charset) { -#if defined HAVE_LANGINFO_H && defined CODESET - char *codeset = nl_langinfo(CODESET); - return strcmp(codeset, "UTF-8") == 0; -#else - return false; -#endif + if ( !charset || wxStricmp(charset, _T("UTF-8")) == 0 + || wxStricmp(charset, _T("UTF8")) == 0 ) + m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL); + else + m_conv = new wxCSConv(charset); } -size_t wxConvBrokenFileNames::MB2WC(wchar_t *outputBuf, const char *psz, size_t outputSize) const +size_t +wxConvBrokenFileNames::MB2WC(wchar_t *outputBuf, + const char *psz, + size_t outputSize) const { - if (UseUTF8()) - return m_utf8conv.MB2WC( outputBuf, psz, outputSize ); - else - return wxMBConvLibc::MB2WC( outputBuf, psz, outputSize ); + return m_conv->MB2WC( outputBuf, psz, outputSize ); } -size_t wxConvBrokenFileNames::WC2MB(char *outputBuf, const wchar_t *psz, size_t outputSize) const +size_t +wxConvBrokenFileNames::WC2MB(char *outputBuf, + const wchar_t *psz, + size_t outputSize) const { - if (UseUTF8()) - return m_utf8conv.WC2MB( outputBuf, psz, outputSize ); - else - return wxMBConvLibc::WC2MB( outputBuf, psz, outputSize ); + return m_conv->WC2MB( outputBuf, psz, outputSize ); } +#endif + // 
---------------------------------------------------------------------------- -// UTF-7 +// UTF-7 // ---------------------------------------------------------------------------- // Implementation (C) 2004 Fredrik Roubert @@ -558,7 +544,7 @@ size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const } #ifndef WC_UTF16 else if (((wxUint32)cc) > 0xffff) - { + { // no surrogate pair generation (yet?) return (size_t)-1; } @@ -615,6 +601,8 @@ size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const static wxUint32 utf8_max[]= { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff }; +// boundaries of the private use area we use to (temporarily) remap +// characters invalid in a UTF-8 encoded string const wxUint32 wxUnicodePUA = 0x100000; const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256; @@ -636,6 +624,15 @@ size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const if (buf) *buf++ = cc; len++; + + // escape the escape character for octal escapes + if ((m_options & MAP_INVALID_UTF8_TO_OCTAL) + && cc == '\\' && (!buf || len < n)) + { + if (buf) + *buf++ = cc; + len++; + } } else { @@ -718,26 +715,23 @@ size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const #endif } } - else - if (m_options & MAP_INVALID_UTF8_TO_OCTAL) + else if (m_options & MAP_INVALID_UTF8_TO_OCTAL) { while (opsz < psz && (!buf || len < n)) { - wchar_t str[6]; - wxSnprintf( str, 5, L"\\%o", (int) (unsigned char) *opsz ); - if (buf) - *buf++ = str[0]; - if (buf) - *buf++ = str[1]; - if (buf) - *buf++ = str[2]; - if (buf) - *buf++ = str[3]; + if ( buf && len + 3 < n ) + { + unsigned char n = *opsz; + *buf++ = L'\\'; + *buf++ = (wchar_t)( L'0' + n / 0100 ); + *buf++ = (wchar_t)( L'0' + (n % 0100) / 010 ); + *buf++ = (wchar_t)( L'0' + n % 010 ); + } opsz++; len += 4; } } - else + else // MAP_INVALID_UTF8_NOT { return (size_t)-1; } @@ -749,6 +743,11 @@ size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) 
const return len; } +static inline bool isoctal(wchar_t wch) +{ + return L'0' <= wch && wch <= L'7'; +} + size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const { size_t len = 0; @@ -763,26 +762,34 @@ size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const #else cc=(*psz++) & 0x7fffffff; #endif - if ((m_options & MAP_INVALID_UTF8_TO_PUA) - && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd) + + if ( (m_options & MAP_INVALID_UTF8_TO_PUA) + && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd ) { if (buf) *buf++ = (char)(cc - wxUnicodePUA); len++; - } - else - if ((m_options & MAP_INVALID_UTF8_TO_OCTAL) - && cc == L'\\') + } + else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) + && cc == L'\\' && psz[0] == L'\\' ) + { + if (buf) + *buf++ = (char)cc; + psz++; + len++; + } + else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) && + cc == L'\\' && + isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) ) { - wchar_t str[4]; - str[0] = *psz; psz++; - str[1] = *psz; psz++; - str[2] = *psz; psz++; - str[3] = 0; - int octal; - wxSscanf( str, L"%o", &octal ); if (buf) - *buf++ = (char) octal; + { + *buf++ = (char) ((psz[0] - L'0')*0100 + + (psz[1] - L'0')*010 + + (psz[2] - L'0')); + } + + psz += 3; len++; } else @@ -810,7 +817,8 @@ size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const } } - if (buf && (lenWC conversion would fail "randomly". 
wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex); #endif - + size_t inbuf = strlen(psz); size_t outbuf = n * SIZEOF_WCHAR_T; size_t res, cres; @@ -1510,7 +1518,7 @@ size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const // NB: explained in MB2WC wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex); #endif - + size_t inbuf = wxWcslen(psz) * SIZEOF_WCHAR_T; size_t outbuf = n; size_t res, cres; @@ -2083,9 +2091,9 @@ public: #if SIZEOF_WCHAR_T == 4 UniChar* szUniCharBuffer = new UniChar[nOutSize]; #endif - + CFStringGetCharacters(theString, theRange, szUniCharBuffer); - + CFRelease(theString); szUniCharBuffer[nOutLength] = '\0' ; @@ -2095,14 +2103,14 @@ public: converter.MB2WC(szOut, (const char*)szUniCharBuffer , nOutSize ) ; delete[] szUniCharBuffer; #endif - + return nOutLength; } size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const { wxASSERT(szUnConv); - + size_t nRealOutSize; size_t nBufSize = wxWcslen(szUnConv); UniChar* szUniBuffer = (UniChar*) szUnConv; @@ -2130,7 +2138,7 @@ public: { if (szOut != NULL) CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut); - + nRealOutSize = CFStringGetLength(theString) + 1; } else @@ -2143,7 +2151,7 @@ public: //0 tells CFString to return NULL if it meets such a character false, //not an external representation (UInt8*) szOut, - nOutSize, + nOutSize, (CFIndex*) &nRealOutSize ); } @@ -2159,7 +2167,7 @@ public: bool IsOk() const { - return m_encoding != kCFStringEncodingInvalidId && + return m_encoding != kCFStringEncodingInvalidId && CFStringIsEncodingAvailable(m_encoding); } @@ -2297,7 +2305,7 @@ public: if ( buf && res < n) { buf[res] = 0; - + //we need to double-trip to verify it didn't insert any ? 
in place //of bogus characters wxWCharBuffer wcBuf(n); @@ -2536,7 +2544,7 @@ wxMBConv *wxCSConv::DoCreate() const #if defined(__WXMAC__) { // leave UTF16 and UTF32 to the built-ins of wx - if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE || + if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE || ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) ) { @@ -2731,7 +2739,6 @@ static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM); static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1); static wxMBConvUTF7 wxConvUTF7Obj; static wxMBConvUTF8 wxConvUTF8Obj; -static wxConvBrokenFileNames wxConvBrokenFileNamesObj; WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj; WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj; @@ -2742,8 +2749,6 @@ WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj; WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = & #ifdef __WXOSX__ wxConvUTF8Obj; -#elif __WXGTK20__ - wxConvBrokenFileNamesObj; #else wxConvLibcObj; #endif