mirror of https://github.com/qt/qt5compat.git
1233 lines
34 KiB
C++
1233 lines
34 KiB
C++
// Copyright (C) 2018 The Qt Company Ltd.
|
|
// Copyright (C) 2018 Intel Corporation.
|
|
// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
|
|
// Qt-Security score:critical reason:data-parser
|
|
|
|
#include "qplatformdefs.h"
|
|
|
|
#include "qtextcodec.h"
|
|
#include "qtextcodec_p.h"
|
|
|
|
#include "qbytearraymatcher.h"
|
|
#include "qendian.h"
|
|
#include "qfile.h"
|
|
#include "qlist.h"
|
|
#include <private/qlocking_p.h>
|
|
#include "qstringlist.h"
|
|
#include "qvarlengtharray.h"
|
|
|
|
#include <private/qcoreapplication_p.h>
|
|
|
|
#include "qutfcodec_p.h"
|
|
#include "qlatincodec_p.h"
|
|
|
|
#if QT_CONFIG(codecs)
|
|
# include "qtsciicodec_p.h"
|
|
# include "qisciicodec_p.h"
|
|
#endif
|
|
#if QT_CONFIG(icu)
|
|
#include "qicucodec_p.h"
|
|
#else
|
|
#if QT_CONFIG(iconv)
|
|
# include "qiconvcodec_p.h"
|
|
#endif
|
|
#ifdef Q_OS_WIN
|
|
# include "qwindowscodec_p.h"
|
|
#endif
|
|
# include "qsimplecodec_p.h"
|
|
#if QT_CONFIG(big_codecs) && QT_CONFIG(textcodec)
|
|
# ifndef Q_OS_INTEGRITY
|
|
# include "qgb18030codec_p.h"
|
|
# include "qeucjpcodec_p.h"
|
|
# include "qjiscodec_p.h"
|
|
# include "qsjiscodec_p.h"
|
|
# include "qeuckrcodec_p.h"
|
|
# include "qbig5codec_p.h"
|
|
# endif // !Q_OS_INTEGRITY
|
|
#endif // big_codecs
|
|
|
|
#endif // icu
|
|
|
|
#include <mutex>
|
|
|
|
#include <stdlib.h>
|
|
#include <ctype.h>
|
|
#include <locale.h>
|
|
#if defined (_XOPEN_UNIX) && !defined(Q_OS_QNX) && !defined(Q_OS_ANDROID)
|
|
# include <langinfo.h>
|
|
#endif
|
|
|
|
QT_BEGIN_NAMESPACE
|
|
|
|
// in qstring.cpp:
|
|
void qt_from_latin1(char16_t *dst, const char *str, size_t size) noexcept;
|
|
|
|
typedef QList<QTextCodec*>::ConstIterator TextCodecListConstIt;
|
|
typedef QList<QByteArray>::ConstIterator ByteArrayListConstIt;
|
|
|
|
Q_GLOBAL_STATIC(QRecursiveMutex, textCodecsMutex);
|
|
|
|
Q_GLOBAL_STATIC(QTextCodecData, textCodecData)
|
|
|
|
QTextCodecData::QTextCodecData()
|
|
: codecForLocale(nullptr)
|
|
{
|
|
}
|
|
|
|
// Destroys every registered codec.
QTextCodecData::~QTextCodecData()
{
    codecForLocale.storeRelease(nullptr);

    // ~QTextCodec() removes the codec from allCodecs and codecCache, so the
    // registry is emptied first and the deletion walks a private copy.
    const QList<QTextCodec *> doomed = allCodecs;
    allCodecs.clear();
    codecCache.clear();

    for (QTextCodec *codec : doomed)
        delete codec;
}
|
|
|
|
// Returns the process-wide codec registry held by the textCodecData global
// static. Callers in this file check the result for nullptr.
QTextCodecData *QTextCodecData::instance()
{
    QTextCodecData *const data = textCodecData();
    return data;
}
|
|
|
|
class TextCodecsMutexLocker
|
|
{
|
|
using Lock = decltype(qt_unique_lock(std::declval<QRecursiveMutex&>()));
|
|
// ### FIXME: this is used when textCodecsMutex already == nullptr
|
|
const Lock lock = qt_unique_lock(textCodecsMutex());
|
|
public:
|
|
TextCodecsMutexLocker() {} // required d/t an ICC 19 bug
|
|
};
|
|
|
|
#if !QT_CONFIG(icu)
|
|
// ASCII-only lower-casing: 'A'..'Z' map to 'a'..'z', every other value is
// returned untouched.
static char qtolower(char c)
{
    const bool isUpper = (c >= 'A' && c <= 'Z');
    return isUpper ? char(c + 0x20) : c;
}
|
|
// ASCII-only test for a digit or a letter.
static bool qisalnum(char c)
{
    if (c >= '0' && c <= '9')
        return true;

    // Setting bit 0x20 folds 'A'..'Z' onto 'a'..'z'.
    const int folded = c | 0x20;
    return folded >= 'a' && folded <= 'z';
}
|
|
|
|
bool qTextCodecNameMatch(const char *n, const char *h)
|
|
{
|
|
if (qstricmp(n, h) == 0)
|
|
return true;
|
|
|
|
// if the letters and numbers are the same, we have a match
|
|
while (*n != '\0') {
|
|
if (qisalnum(*n)) {
|
|
for (;;) {
|
|
if (*h == '\0')
|
|
return false;
|
|
if (qisalnum(*h))
|
|
break;
|
|
++h;
|
|
}
|
|
if (qtolower(*n) != qtolower(*h))
|
|
return false;
|
|
++h;
|
|
}
|
|
++n;
|
|
}
|
|
while (*h && !qisalnum(*h))
|
|
++h;
|
|
return (*h == '\0');
|
|
}
|
|
|
|
|
|
#if !defined(Q_OS_WIN32) && !defined(QT_LOCALE_IS_UTF8)
|
|
// Looks up \a name; if that fails and the name carries an "@modifier"
// suffix, retries with everything before the '@'. Returns nullptr when
// neither lookup succeeds.
static QTextCodec *checkForCodec(const QByteArray &name)
{
    if (QTextCodec *exact = QTextCodec::codecForName(name))
        return exact;

    const int at = name.indexOf('@');
    if (at == -1)
        return nullptr;

    return QTextCodec::codecForName(name.left(at));
}
|
|
#endif
|
|
|
|
static void setup();
|
|
|
|
// \threadsafe
|
|
// this returns the codec the method sets up as locale codec to
|
|
// avoid a race condition in codecForLocale() when
|
|
// setCodecForLocale(nullptr) is called at the same time.
|
|
static QTextCodec *setupLocaleMapper()
|
|
{
|
|
QTextCodecData *globalData = QTextCodecData::instance();
|
|
|
|
QTextCodec *locale = nullptr;
|
|
|
|
{
|
|
const TextCodecsMutexLocker locker;
|
|
if (globalData->allCodecs.isEmpty())
|
|
setup();
|
|
}
|
|
|
|
QCoreApplicationPrivate::initLocale();
|
|
|
|
#if defined(QT_LOCALE_IS_UTF8)
|
|
locale = QTextCodec::codecForName("UTF-8");
|
|
#elif defined(Q_OS_WIN)
|
|
locale = QTextCodec::codecForName("System");
|
|
#else
|
|
|
|
// First try getting the codecs name from nl_langinfo and see
|
|
// if we have a builtin codec for it.
|
|
// Only fall back to using iconv if we can't find a builtin codec
|
|
// This is because the builtin utf8 codec is around 5 times faster
|
|
// then the using QIconvCodec
|
|
|
|
#if defined (_XOPEN_UNIX)
|
|
char *charset = nl_langinfo(CODESET);
|
|
if (charset)
|
|
locale = QTextCodec::codecForName(charset);
|
|
#endif
|
|
#if QT_CONFIG(iconv)
|
|
if (!locale) {
|
|
// no builtin codec for the locale found, let's try using iconv
|
|
(void) new QIconvCodec();
|
|
locale = QTextCodec::codecForName("System");
|
|
}
|
|
#endif
|
|
|
|
if (!locale) {
|
|
// Very poorly defined and followed standards causes lots of
|
|
// code to try to get all the cases... This logic is
|
|
// duplicated in QIconvCodec, so if you change it here, change
|
|
// it there too.
|
|
|
|
// Try to determine locale codeset from locale name assigned to
|
|
// LC_CTYPE category.
|
|
|
|
// First part is getting that locale name. First try setlocale() which
|
|
// definitely knows it, but since we cannot fully trust it, get ready
|
|
// to fall back to environment variables.
|
|
const QByteArray ctype = setlocale(LC_CTYPE, nullptr);
|
|
|
|
// Get the first nonempty value from $LC_ALL, $LC_CTYPE, and $LANG
|
|
// environment variables.
|
|
QByteArray lang = qgetenv("LC_ALL");
|
|
if (lang.isEmpty() || lang == "C") {
|
|
lang = qgetenv("LC_CTYPE");
|
|
}
|
|
if (lang.isEmpty() || lang == "C") {
|
|
lang = qgetenv("LANG");
|
|
}
|
|
|
|
// Now try these in order:
|
|
// 1. CODESET from ctype if it contains a .CODESET part (e.g. en_US.ISO8859-15)
|
|
// 2. CODESET from lang if it contains a .CODESET part
|
|
// 3. ctype (maybe the locale is named "ISO-8859-1" or something)
|
|
// 4. locale (ditto)
|
|
// 5. check for "@euro"
|
|
// 6. guess locale from ctype unless ctype is "C"
|
|
// 7. guess locale from lang
|
|
|
|
// 1. CODESET from ctype if it contains a .CODESET part (e.g. en_US.ISO8859-15)
|
|
int indexOfDot = ctype.indexOf('.');
|
|
if (indexOfDot != -1)
|
|
locale = checkForCodec( ctype.mid(indexOfDot + 1) );
|
|
|
|
// 2. CODESET from lang if it contains a .CODESET part
|
|
if (!locale) {
|
|
indexOfDot = lang.indexOf('.');
|
|
if (indexOfDot != -1)
|
|
locale = checkForCodec( lang.mid(indexOfDot + 1) );
|
|
}
|
|
|
|
// 3. ctype (maybe the locale is named "ISO-8859-1" or something)
|
|
if (!locale && !ctype.isEmpty() && ctype != "C")
|
|
locale = checkForCodec(ctype);
|
|
|
|
// 4. locale (ditto)
|
|
if (!locale && !lang.isEmpty())
|
|
locale = checkForCodec(lang);
|
|
|
|
// 5. "@euro"
|
|
if ((!locale && ctype.contains("@euro")) || lang.contains("@euro"))
|
|
locale = checkForCodec("ISO 8859-15");
|
|
}
|
|
|
|
#endif
|
|
// If everything failed, we default to 8859-1
|
|
if (!locale)
|
|
locale = QTextCodec::codecForName("ISO 8859-1");
|
|
globalData->codecForLocale.storeRelease(locale);
|
|
return locale;
|
|
}
|
|
|
|
|
|
// textCodecsMutex need to be locked to enter this function
|
|
static void setup()
|
|
{
|
|
static bool initialized = false;
|
|
if (initialized)
|
|
return;
|
|
initialized = true;
|
|
|
|
#if QT_CONFIG(codecs)
|
|
(void)new QTsciiCodec;
|
|
for (int i = 0; i < 9; ++i)
|
|
(void)new QIsciiCodec(i);
|
|
for (int i = 0; i < QSimpleTextCodec::numSimpleCodecs; ++i)
|
|
(void)new QSimpleTextCodec(i);
|
|
|
|
# if QT_CONFIG(big_codecs) && !defined(Q_OS_INTEGRITY)
|
|
(void)new QGb18030Codec;
|
|
(void)new QGbkCodec;
|
|
(void)new QGb2312Codec;
|
|
(void)new QEucJpCodec;
|
|
(void)new QJisCodec;
|
|
(void)new QSjisCodec;
|
|
(void)new QEucKrCodec;
|
|
(void)new QCP949Codec;
|
|
(void)new QBig5Codec;
|
|
(void)new QBig5hkscsCodec;
|
|
# endif // big_codecs && !Q_OS_INTEGRITY
|
|
#if QT_CONFIG(iconv)
|
|
(void) new QIconvCodec;
|
|
#endif
|
|
#if defined(Q_OS_WIN32)
|
|
(void) new QWindowsLocalCodec;
|
|
#endif // Q_OS_WIN32
|
|
#endif // codecs
|
|
|
|
(void)new QUtf16Codec;
|
|
(void)new QUtf16BECodec;
|
|
(void)new QUtf16LECodec;
|
|
(void)new QUtf32Codec;
|
|
(void)new QUtf32BECodec;
|
|
(void)new QUtf32LECodec;
|
|
(void)new QLatin15Codec;
|
|
(void)new QLatin1Codec;
|
|
(void)new QUtf8Codec;
|
|
}
|
|
#else
|
|
static void setup() {}
|
|
#endif // icu
|
|
|
|
/*!
|
|
\class QTextCodec
|
|
\inmodule QtCore5Compat
|
|
\brief The QTextCodec class provides conversions between text encodings.
|
|
\reentrant
|
|
\ingroup i18n
|
|
|
|
Qt uses Unicode to store, draw and manipulate strings. In many
|
|
situations you may wish to deal with data that uses a different
|
|
encoding. For example, most Japanese documents are still stored
|
|
in Shift-JIS or ISO 2022-JP, while Russian users often have their
|
|
documents in KOI8-R or Windows-1251.
|
|
|
|
Qt provides a set of QTextCodec classes to help with converting
|
|
non-Unicode formats to and from Unicode. You can also create your
|
|
own codec classes.
|
|
|
|
The supported encodings are:
|
|
|
|
\list
|
|
\li \l{Big5 Text Codec}{Big5}
|
|
\li \l{Big5-HKSCS Text Codec}{Big5-HKSCS}
|
|
\li CP949
|
|
\li \l{EUC-JP Text Codec}{EUC-JP}
|
|
\li \l{EUC-KR Text Codec}{EUC-KR}
|
|
\li \l{GBK Text Codec}{GB18030}
|
|
\li HP-ROMAN8
|
|
\li IBM 850
|
|
\li IBM 866
|
|
\li IBM 874
|
|
\li \l{ISO 2022-JP (JIS) Text Codec}{ISO 2022-JP}
|
|
\li ISO 8859-1 to 10
|
|
\li ISO 8859-13 to 16
|
|
\li Iscii-Bng, Dev, Gjr, Knd, Mlm, Ori, Pnj, Tlg, and Tml
|
|
\li KOI8-R
|
|
\li KOI8-U
|
|
\li Macintosh
|
|
\li \l{Shift-JIS Text Codec}{Shift-JIS}
|
|
\li TIS-620
|
|
\li \l{TSCII Text Codec}{TSCII}
|
|
\li UTF-8
|
|
\li UTF-16
|
|
\li UTF-16BE
|
|
\li UTF-16LE
|
|
\li UTF-32
|
|
\li UTF-32BE
|
|
\li UTF-32LE
|
|
\li Windows-1250 to 1258
|
|
\endlist
|
|
|
|
If Qt is compiled with ICU support enabled, most codecs supported by
|
|
ICU will also be available to the application.
|
|
|
|
\l {QTextCodec}s can be used as follows to convert some locally encoded
|
|
string to Unicode. Suppose you have some string encoded in Russian
|
|
KOI8-R encoding, and want to convert it to Unicode. The simple way
|
|
to do it is like this:
|
|
|
|
\snippet code/src_corelib_codecs_qtextcodec.cpp 0
|
|
|
|
After this, \c string holds the text converted to Unicode.
|
|
Converting a string from Unicode to the local encoding is just as
|
|
easy:
|
|
|
|
\snippet code/src_corelib_codecs_qtextcodec.cpp 1
|
|
|
|
Some care must be taken when trying to convert the data in chunks,
|
|
for example, when receiving it over a network. In such cases it is
|
|
possible that a multi-byte character will be split over two
|
|
chunks. At best this might result in the loss of a character and
|
|
at worst cause the entire conversion to fail.
|
|
|
|
The approach to use in these situations is to create a QTextDecoder
|
|
object for the codec and use this QTextDecoder for the whole
|
|
decoding process, as shown below:
|
|
|
|
\snippet code/src_corelib_codecs_qtextcodec.cpp 2
|
|
|
|
The QTextDecoder object maintains state between chunks and therefore
|
|
works correctly even if a multi-byte character is split between
|
|
chunks.
|
|
|
|
\section1 Creating Your Own Codec Class
|
|
|
|
Support for new text encodings can be added to Qt by creating
|
|
QTextCodec subclasses.
|
|
|
|
The pure virtual functions describe the encoder to the system and
|
|
the coder is used as required in the different text file formats
|
|
supported by QTextStream, and under X11, for the locale-specific
|
|
character input and output.
|
|
|
|
To add support for another encoding to Qt, make a subclass of
|
|
QTextCodec and implement the functions listed in the table below.
|
|
|
|
\table
|
|
\header \li Function \li Description
|
|
|
|
\row \li name()
|
|
\li Returns the official name for the encoding. If the
|
|
encoding is listed in the
|
|
\l{IANA character-sets encoding file}, the name
|
|
should be the preferred MIME name for the encoding.
|
|
|
|
\row \li aliases()
|
|
\li Returns a list of alternative names for the encoding.
|
|
QTextCodec provides a default implementation that returns
|
|
an empty list. For example, "ISO-8859-1" has "latin1",
|
|
"CP819", "IBM819", and "iso-ir-100" as aliases.
|
|
|
|
\row \li \l{QTextCodec::mibEnum()}{mibEnum()}
|
|
\li Return the MIB enum for the encoding if it is listed in
|
|
the \l{IANA character-sets encoding file}.
|
|
|
|
\row \li convertToUnicode()
|
|
\li Converts an 8-bit character string to Unicode.
|
|
|
|
\row \li convertFromUnicode()
|
|
\li Converts a Unicode string to an 8-bit character string.
|
|
\endtable
|
|
|
|
\sa QTextStream, QTextDecoder, QTextEncoder
|
|
*/
|
|
|
|
/*!
|
|
Constructs a QTextCodec, and gives it the highest precedence. The
|
|
QTextCodec should always be constructed on the heap (i.e. with \c
|
|
new). Qt takes ownership and will delete it when the application
|
|
terminates.
|
|
*/
|
|
QTextCodec::QTextCodec()
{
    const TextCodecsMutexLocker locker;

    QTextCodecData *globalInstance = QTextCodecData::instance();
    // The registry is a global static and is gone during late application
    // shutdown; like the destructor, do nothing rather than dereference null.
    if (!globalInstance)
        return;

    // Make sure the built-in codecs exist before this one is added.
    if (globalInstance->allCodecs.isEmpty())
        setup();

    // Prepending is what gives the newest codec the highest precedence.
    globalInstance->allCodecs.prepend(this);
}
|
|
|
|
|
|
/*!
|
|
\nonreentrant
|
|
|
|
Destroys the QTextCodec. Note that you should not delete codecs
|
|
yourself: once created they become Qt's responsibility.
|
|
*/
|
|
QTextCodec::~QTextCodec()
{
    QTextCodecData *const data = QTextCodecData::instance();
    if (data == nullptr)
        return;

    // Stop being the locale codec, if this object currently is.
    data->codecForLocale.testAndSetRelaxed(this, nullptr);

    const TextCodecsMutexLocker locker;

    data->allCodecs.removeOne(this);

    // Drop every cache entry that still points at this codec.
    QTextCodecCache &cache = data->codecCache;
    for (auto entry = cache.begin(); entry != cache.end(); ) {
        if (entry.value() == this)
            entry = cache.erase(entry);
        else
            ++entry;
    }
}
|
|
|
|
/*!
|
|
\fn QTextCodec *QTextCodec::codecForName(const char *name)
|
|
|
|
Searches all installed QTextCodec objects and returns the one
|
|
which best matches \a name; the match is case-insensitive. Returns
|
|
\nullptr if no codec matching the name \a name could be found.
|
|
*/
|
|
|
|
/*!
|
|
\threadsafe
|
|
Searches all installed QTextCodec objects and returns the one
|
|
which best matches \a name; the match is case-insensitive. Returns
|
|
\nullptr if no codec matching the name \a name could be found.
|
|
*/
|
|
QTextCodec *QTextCodec::codecForName(const QByteArray &name)
{
    if (name.isEmpty())
        return nullptr;

    const TextCodecsMutexLocker locker;

    QTextCodecData *globalData = QTextCodecData::instance();
    if (!globalData)
        return nullptr;
    setup();

#if !QT_CONFIG(icu)
    // Fast path: a previous lookup of exactly this spelling.
    QTextCodecCache &cache = globalData->codecCache;
    if (QTextCodec *cached = cache.value(name))
        return cached;

    const QList<QTextCodec *> &registered = globalData->allCodecs;
    for (QTextCodec *candidate : registered) {
        // The primary name is tried first, then each alias in order.
        bool matches = qTextCodecNameMatch(candidate->name(), name);
        if (!matches) {
            const QList<QByteArray> aliases = candidate->aliases();
            for (const QByteArray &alias : aliases) {
                if (qTextCodecNameMatch(alias, name)) {
                    matches = true;
                    break;
                }
            }
        }
        if (matches) {
            cache.insert(name, candidate);
            return candidate;
        }
    }

    return nullptr;
#else
    return QIcuCodec::codecForNameUnlocked(name);
#endif
}
|
|
|
|
|
|
/*!
|
|
\threadsafe
|
|
Returns the QTextCodec which matches the
|
|
\l{QTextCodec::mibEnum()}{MIBenum} \a mib.
|
|
*/
|
|
QTextCodec* QTextCodec::codecForMib(int mib)
{
    const TextCodecsMutexLocker locker;

    QTextCodecData *globalData = QTextCodecData::instance();
    if (!globalData)
        return nullptr;
    if (globalData->allCodecs.isEmpty())
        setup();

    // MIB lookups share the name cache, under a "MIB: <number>" key.
    const QByteArray key = "MIB: " + QByteArray::number(mib);

    QTextCodecCache &cache = globalData->codecCache;
    if (QTextCodec *cached = cache.value(key))
        return cached;

    const QList<QTextCodec *> &registered = globalData->allCodecs;
    for (QTextCodec *candidate : registered) {
        if (candidate->mibEnum() != mib)
            continue;
        cache.insert(key, candidate);
        return candidate;
    }

#if QT_CONFIG(icu)
    return QIcuCodec::codecForMibUnlocked(mib);
#else
    return nullptr;
#endif
}
|
|
|
|
/*!
|
|
\threadsafe
|
|
Returns the list of all available codecs, by name. Call
|
|
QTextCodec::codecForName() to obtain the QTextCodec for the name.
|
|
|
|
The list may contain many mentions of the same codec
|
|
if the codec has aliases.
|
|
|
|
\sa availableMibs(), name(), aliases()
|
|
*/
|
|
QList<QByteArray> QTextCodec::availableCodecs()
{
    const TextCodecsMutexLocker locker;

    QList<QByteArray> codecs;

    // The registry may already be destroyed during application shutdown;
    // in that case there are simply no registered codecs to report.
    if (QTextCodecData *globalData = QTextCodecData::instance()) {
        if (globalData->allCodecs.isEmpty())
            setup();

        const QList<QTextCodec *> &registered = globalData->allCodecs;
        for (QTextCodec *codec : registered) {
            // A codec contributes its primary name followed by its aliases.
            codecs += codec->name();
            codecs += codec->aliases();
        }
    }

#if QT_CONFIG(icu)
    codecs += QIcuCodec::availableCodecs();
#endif

    return codecs;
}
|
|
|
|
/*!
|
|
\threadsafe
|
|
Returns the list of MIBs for all available codecs. Call
|
|
QTextCodec::codecForMib() to obtain the QTextCodec for the MIB.
|
|
|
|
\sa availableCodecs(), mibEnum()
|
|
*/
|
|
QList<int> QTextCodec::availableMibs()
{
#if QT_CONFIG(icu)
    return QIcuCodec::availableMibs();
#else
    const TextCodecsMutexLocker locker;

    QList<int> codecs;

    QTextCodecData *globalData = QTextCodecData::instance();
    // The registry may already be destroyed during application shutdown.
    if (!globalData)
        return codecs;

    if (globalData->allCodecs.isEmpty())
        setup();

    const QList<QTextCodec *> &registered = globalData->allCodecs;
    for (QTextCodec *codec : registered)
        codecs += codec->mibEnum();

    return codecs;
#endif
}
|
|
|
|
/*!
|
|
\nonreentrant
|
|
|
|
Set the codec to \a c; this will be returned by
|
|
codecForLocale(). If \a c is \nullptr, the codec is reset to
|
|
the default.
|
|
|
|
This might be needed for some applications that want to use their
|
|
own mechanism for setting the locale.
|
|
|
|
\sa codecForLocale()
|
|
*/
|
|
void QTextCodec::setCodecForLocale(QTextCodec *c)
{
    // instance() is checked for nullptr by codecForLocale() and the
    // destructor as well; do the same here instead of dereferencing blindly.
    QTextCodecData *globalData = QTextCodecData::instance();
    if (!globalData)
        return;

    // Storing nullptr makes the next codecForLocale() call recompute the default.
    globalData->codecForLocale.storeRelease(c);
}
|
|
|
|
/*!
|
|
\threadsafe
|
|
Returns a pointer to the codec most suitable for this locale.
|
|
|
|
The codec will be retrieved from ICU where that backend is in use, otherwise
|
|
it may be obtained from an OS-specific API. In the latter case, the codec's
|
|
name may be "System".
|
|
*/
|
|
|
|
QTextCodec* QTextCodec::codecForLocale()
{
    QTextCodecData *globalData = QTextCodecData::instance();
    if (!globalData)
        return nullptr;

    // Already resolved (or explicitly set): nothing more to do.
    if (QTextCodec *current = globalData->codecForLocale.loadAcquire())
        return current;

#if QT_CONFIG(icu)
    const TextCodecsMutexLocker locker;
    return QIcuCodec::defaultCodecUnlocked();
#else
    // setupLocaleMapper locks as necessary
    return setupLocaleMapper();
#endif
}
|
|
|
|
|
|
/*!
|
|
\fn QByteArray QTextCodec::name() const
|
|
|
|
QTextCodec subclasses must reimplement this function. It returns
|
|
the name of the encoding supported by the subclass.
|
|
|
|
If the codec is registered as a character set in the
|
|
\l{IANA character-sets encoding file} this method should
|
|
return the preferred mime name for the codec if defined,
|
|
otherwise its name.
|
|
*/
|
|
|
|
/*!
|
|
\fn int QTextCodec::mibEnum() const
|
|
|
|
Subclasses of QTextCodec must reimplement this function. It
|
|
returns the \l{QTextCodec::mibEnum()}{MIBenum} (see \l{IANA character-sets encoding file}
|
|
for more information). It is important that each QTextCodec
|
|
subclass returns the correct unique value for this function.
|
|
*/
|
|
|
|
/*!
|
|
Subclasses can return a number of aliases for the codec in question.
|
|
|
|
Standard aliases for codecs can be found in the
|
|
\l{IANA character-sets encoding file}.
|
|
*/
|
|
QList<QByteArray> QTextCodec::aliases() const
{
    // Default implementation: no alternative names.
    return {};
}
|
|
|
|
/*!
|
|
\fn QString QTextCodec::convertToUnicode(const char *chars, int len,
|
|
ConverterState *state) const
|
|
|
|
QTextCodec subclasses must reimplement this function.
|
|
|
|
Converts the first \a len characters of \a chars from the
|
|
encoding of the subclass to Unicode, and returns the result in a
|
|
QString.
|
|
|
|
\a state can be \nullptr, in which case the conversion is stateless and
|
|
default conversion rules should be used. If \a state is not \nullptr, the
|
|
codec should save the state after the conversion in \a state, and
|
|
adjust the \c remainingChars and \c invalidChars members of the struct.
|
|
*/
|
|
|
|
/*!
|
|
\fn QByteArray QTextCodec::convertFromUnicode(const QChar *input, int number,
|
|
ConverterState *state) const
|
|
|
|
QTextCodec subclasses must reimplement this function.
|
|
|
|
Converts the first \a number of characters from the \a input array
|
|
from Unicode to the encoding of the subclass, and returns the result
|
|
in a QByteArray.
|
|
|
|
\a state can be \nullptr in which case the conversion is stateless and
|
|
default conversion rules should be used. If \a state is not \nullptr, the
|
|
codec should save the state after the conversion in \a state, and
|
|
adjust the \c remainingChars and \c invalidChars members of the struct.
|
|
*/
|
|
|
|
/*!
|
|
Creates a QTextDecoder with a specified \a flags to decode chunks
|
|
of \c{char *} data to create chunks of Unicode data.
|
|
|
|
The caller is responsible for deleting the returned object.
|
|
|
|
\since 4.7
|
|
*/
|
|
QTextDecoder* QTextCodec::makeDecoder(QTextCodec::ConversionFlags flags) const
{
    // Heap-allocated; ownership passes to the caller.
    QTextDecoder *decoder = new QTextDecoder(this, flags);
    return decoder;
}
|
|
|
|
/*!
|
|
Creates a QTextEncoder with a specified \a flags to encode chunks
|
|
of Unicode data as \c{char *} data.
|
|
|
|
The caller is responsible for deleting the returned object.
|
|
|
|
\since 4.7
|
|
*/
|
|
QTextEncoder* QTextCodec::makeEncoder(QTextCodec::ConversionFlags flags) const
{
    // Heap-allocated; ownership passes to the caller.
    QTextEncoder *encoder = new QTextEncoder(this, flags);
    return encoder;
}
|
|
|
|
/*!
|
|
\fn QByteArray QTextCodec::fromUnicode(const QChar *input, int number,
|
|
ConverterState *state) const
|
|
|
|
Converts the first \a number of characters from the \a input array
|
|
from Unicode to the encoding of this codec, and returns the result
|
|
in a QByteArray.
|
|
|
|
The \a state of the convertor used is updated.
|
|
*/
|
|
|
|
/*!
|
|
Converts \a str from Unicode to the encoding of this codec, and
|
|
returns the result in a QByteArray.
|
|
*/
|
|
QByteArray QTextCodec::fromUnicode(const QString& str) const
{
    // One-shot conversion: a throw-away state flagged as stateless.
    ConverterState oneShot(DefaultConversion | Flag::Stateless);
    const QChar *const input = str.constData();
    return convertFromUnicode(input, str.size(), &oneShot);
}
|
|
|
|
/*!
|
|
\overload
|
|
\since 5.10
|
|
|
|
Converts \a str from Unicode to the encoding of this codec, and
|
|
returns the result in a QByteArray.
|
|
*/
|
|
QByteArray QTextCodec::fromUnicode(QStringView str) const
{
    // One-shot conversion: a throw-away state flagged as stateless.
    ConverterState oneShot(DefaultConversion | Flag::Stateless);
    const QChar *const input = str.data();
    return convertFromUnicode(input, str.size(), &oneShot);
}
|
|
|
|
/*!
|
|
\fn QString QTextCodec::toUnicode(const char *input, int size,
|
|
ConverterState *state) const
|
|
|
|
Converts the first \a size characters from the \a input from the
|
|
encoding of this codec to Unicode, and returns the result in a
|
|
QString.
|
|
|
|
The \a state of the convertor used is updated.
|
|
*/
|
|
|
|
/*!
|
|
Converts \a a from the encoding of this codec to Unicode, and
|
|
returns the result in a QString.
|
|
*/
|
|
QString QTextCodec::toUnicode(const QByteArray& a) const
{
    // One-shot conversion: a throw-away state flagged as stateless.
    ConverterState oneShot(DefaultConversion | Flag::Stateless);
    const char *const bytes = a.constData();
    return convertToUnicode(bytes, a.size(), &oneShot);
}
|
|
|
|
/*!
|
|
Returns \c true if the Unicode character \a ch can be fully encoded
|
|
with this codec; otherwise returns \c false.
|
|
*/
|
|
bool QTextCodec::canEncode(QChar ch) const
{
    // Run a trial conversion and look only at the invalid-character count;
    // the encoded bytes themselves are discarded.
    ConverterState probe;
    probe.flags = ConvertInvalidToNull;
    convertFromUnicode(&ch, 1, &probe);
    const bool encodable = (probe.invalidChars == 0);
    return encodable;
}
|
|
|
|
/*!
|
|
\overload
|
|
|
|
\a s contains the string being tested for encode-ability.
|
|
*/
|
|
bool QTextCodec::canEncode(const QString& s) const
|
|
{
|
|
ConverterState state;
|
|
state.flags = ConvertInvalidToNull;
|
|
convertFromUnicode(s.constData(), s.size(), &state);
|
|
return (state.invalidChars == 0);
|
|
}
|
|
|
|
/*!
|
|
\overload
|
|
\since 5.10
|
|
|
|
Returns \c true if the Unicode string \a s can be fully encoded
|
|
with this codec; otherwise returns \c false.
|
|
*/
|
|
bool QTextCodec::canEncode(QStringView s) const
{
    // Run a trial conversion and look only at the invalid-character count;
    // the encoded bytes themselves are discarded.
    ConverterState probe;
    probe.flags = ConvertInvalidToNull;
    convertFromUnicode(s.data(), s.size(), &probe);
    if (probe.invalidChars)
        return false;
    return true;
}
|
|
/*!
|
|
\overload
|
|
|
|
\a chars contains the source characters.
|
|
*/
|
|
QString QTextCodec::toUnicode(const char *chars) const
{
    // The input is NUL-terminated; measure it, then convert statelessly
    // (no ConverterState is supplied).
    const int byteCount = int(qstrlen(chars));
    return convertToUnicode(chars, byteCount, nullptr);
}
|
|
|
|
|
|
/*!
|
|
\class QTextEncoder
|
|
\inmodule QtCore5Compat
|
|
\brief The QTextEncoder class provides a state-based encoder.
|
|
\reentrant
|
|
\ingroup i18n
|
|
|
|
A text encoder converts text from Unicode into an encoded text format
|
|
using a specific codec.
|
|
|
|
The encoder converts Unicode into another format, remembering any
|
|
state that is required between calls.
|
|
|
|
\sa QTextCodec::makeEncoder(), QTextDecoder
|
|
*/
|
|
|
|
/*!
|
|
\fn QTextEncoder::QTextEncoder(const QTextCodec *codec)
|
|
|
|
Constructs a text encoder for the given \a codec.
|
|
*/
|
|
|
|
/*!
|
|
Constructs a text encoder for the given \a codec and conversion \a flags.
|
|
|
|
\since 4.7
|
|
*/
|
|
QTextEncoder::QTextEncoder(const QTextCodec *codec, QTextCodec::ConversionFlags flags)
|
|
: c(codec), state()
|
|
{
|
|
state.flags = flags;
|
|
}
|
|
|
|
/*!
|
|
Destroys the encoder.
|
|
*/
|
|
// Nothing to release explicitly; the members clean up after themselves.
QTextEncoder::~QTextEncoder() = default;
|
|
|
|
/*!
|
|
\internal
|
|
\since 4.5
|
|
Determines whether the encoder encountered a failure while encoding the input. If
|
|
an error was encountered, the produced result is undefined, and gets converted according
|
|
to the conversion flags.
|
|
*/
|
|
bool QTextEncoder::hasFailure() const
|
|
{
|
|
return state.invalidChars != 0;
|
|
}
|
|
|
|
/*!
|
|
Converts the Unicode string \a str into an encoded QByteArray.
|
|
*/
|
|
QByteArray QTextEncoder::fromUnicode(const QString& str)
{
    // Delegates to the codec, threading this encoder's state through the call.
    const QChar *const input = str.constData();
    return c->fromUnicode(input, str.size(), &state);
}
|
|
|
|
/*!
|
|
\overload
|
|
\since 5.10
|
|
Converts the Unicode string \a str into an encoded QByteArray.
|
|
*/
|
|
QByteArray QTextEncoder::fromUnicode(QStringView str)
{
    // Delegates to the codec, threading this encoder's state through the call.
    const QChar *const input = str.data();
    return c->fromUnicode(input, str.size(), &state);
}
|
|
|
|
/*!
|
|
\overload
|
|
|
|
Converts \a len characters (not bytes) from \a uc, and returns the
|
|
result in a QByteArray.
|
|
*/
|
|
QByteArray QTextEncoder::fromUnicode(const QChar *uc, int len)
{
    // Delegates to the codec, threading this encoder's state through the call.
    QByteArray encoded = c->fromUnicode(uc, len, &state);
    return encoded;
}
|
|
|
|
/*!
|
|
\class QTextDecoder
|
|
\inmodule QtCore5Compat
|
|
\brief The QTextDecoder class provides a state-based decoder.
|
|
\reentrant
|
|
\ingroup i18n
|
|
|
|
A text decoder converts text from an encoded text format into Unicode
|
|
using a specific codec.
|
|
|
|
The decoder converts text in this format into Unicode, remembering any
|
|
state that is required between calls.
|
|
|
|
\sa QTextCodec::makeDecoder(), QTextEncoder
|
|
*/
|
|
|
|
/*!
|
|
\fn QTextDecoder::QTextDecoder(const QTextCodec *codec)
|
|
|
|
Constructs a text decoder for the given \a codec.
|
|
*/
|
|
|
|
/*!
|
|
Constructs a text decoder for the given \a codec and conversion \a flags.
|
|
|
|
\since 4.7
|
|
*/
|
|
|
|
QTextDecoder::QTextDecoder(const QTextCodec *codec, QTextCodec::ConversionFlags flags)
|
|
: c(codec), state()
|
|
{
|
|
state.flags = flags;
|
|
}
|
|
|
|
/*!
|
|
Destroys the decoder.
|
|
*/
|
|
// Nothing to release explicitly; the members clean up after themselves.
QTextDecoder::~QTextDecoder() = default;
|
|
|
|
/*!
|
|
\fn QString QTextDecoder::toUnicode(const char *chars, int len)
|
|
|
|
Converts the first \a len bytes in \a chars to Unicode, returning
|
|
the result.
|
|
|
|
If not all characters are used (e.g. if only part of a multi-byte
|
|
encoding is at the end of the characters), the decoder remembers
|
|
enough state to continue with the next call to this function.
|
|
*/
|
|
QString QTextDecoder::toUnicode(const char *chars, int len)
{
    // Delegates to the codec, threading this decoder's state through the call.
    QString decoded = c->toUnicode(chars, len, &state);
    return decoded;
}
|
|
|
|
/*! \overload
|
|
|
|
The converted string is returned in \a target.
|
|
*/
|
|
void QTextDecoder::toUnicode(QString *target, const char *chars, int len)
{
    Q_ASSERT(target);

    const int mib = c->mibEnum();
    if (mib == 106) { // utf8
        // Decode straight into *target via the UTF-8 codec's own overload.
        static_cast<const QUtf8Codec*>(c)->convertToUnicode(target, chars, len, &state);
    } else if (mib == 4) { // latin1
        // Size the target to len and fill it in place; this path does not touch the state.
        target->resize(len);
        qt_from_latin1(reinterpret_cast<char16_t *>(target->data()), chars, len);
    } else {
        *target = c->toUnicode(chars, len, &state);
    }
}
|
|
|
|
|
|
/*!
|
|
\overload
|
|
|
|
Converts the bytes in the byte array specified by \a ba to Unicode
|
|
and returns the result.
|
|
*/
|
|
QString QTextDecoder::toUnicode(const QByteArray &ba)
{
    // Delegates to the codec, threading this decoder's state through the call.
    const char *const bytes = ba.constData();
    return c->toUnicode(bytes, ba.size(), &state);
}
|
|
|
|
/*!
|
|
\since 4.4
|
|
|
|
Tries to detect the encoding of the provided snippet of HTML in
|
|
the given byte array, \a ba, by checking the BOM (Byte Order Mark)
|
|
and the content-type meta header and returns a QTextCodec instance
|
|
that is capable of decoding the html to unicode. If the codec
|
|
cannot be detected from the content provided, \a defaultCodec is
|
|
returned.
|
|
|
|
\sa codecForUtfText()
|
|
*/
|
|
QTextCodec *QTextCodec::codecForHtml(const QByteArray &ba, QTextCodec *defaultCodec)
{
    // A byte order mark, when present, decides the codec outright.
    if (QTextCodec *fromBom = QTextCodec::codecForUtfText(ba, nullptr))
        return fromBom;

    static Q_RELAXED_CONSTEXPR auto metaMatcher = qMakeStaticByteArrayMatcher("meta ");
    static Q_RELAXED_CONSTEXPR auto charsetMatcher = qMakeStaticByteArrayMatcher("charset=");

    // Only the first 1024 bytes are searched, case-insensitively, for
    // "meta " followed somewhere later by "charset=".
    const QByteArray header = ba.left(1024).toLower();
    qsizetype pos = metaMatcher.indexIn(header);
    if (pos == -1)
        return defaultCodec;
    pos = charsetMatcher.indexIn(header, pos);
    if (pos == -1)
        return defaultCodec;
    pos += qstrlen("charset=");

    // The value may be quoted (<meta charset="utf-8">). Skip the opening
    // quote so it does not become part of the name; otherwise the
    // "unicode" comparison below can never match a quoted value.
    if (pos < header.size() && (header.at(pos) == '\"' || header.at(pos) == '\''))
        ++pos;

    qsizetype pos2 = pos;
    // The attribute can be closed with either """, "'", ">" or "/",
    // none of which are valid charset characters.
    while (++pos2 < header.size()) {
        const char ch = header.at(pos2);
        if (ch == '\"' || ch == '\'' || ch == '>' || ch == '/') {
            QByteArray name = header.mid(pos, pos2 - pos);
            if (name == "unicode") // QTBUG-41998, ICU will return UTF-16.
                name = QByteArrayLiteral("UTF-8");
            QTextCodec *declared = QTextCodec::codecForName(name);
            return declared ? declared : defaultCodec;
        }
    }

    // The charset value was never terminated within the searched header.
    return defaultCodec;
}
|
|
|
|
/*!
|
|
\overload
|
|
|
|
Tries to detect the encoding of the provided snippet of HTML in
|
|
the given byte array, \a ba, by checking the BOM (Byte Order Mark)
|
|
and the content-type meta header and returns a QTextCodec instance
|
|
that is capable of decoding the html to unicode. If the codec cannot
|
|
be detected, this overload returns a Latin-1 QTextCodec.
|
|
*/
|
|
QTextCodec *QTextCodec::codecForHtml(const QByteArray &ba)
{
    // Latin-1 is the fallback when neither BOM nor meta tag names a codec.
    QTextCodec *const latin1 = QTextCodec::codecForName("ISO-8859-1");
    return codecForHtml(ba, latin1);
}
|
|
|
|
/*!
|
|
\since 4.6
|
|
|
|
Tries to detect the encoding of the provided snippet \a ba by
|
|
using the BOM (Byte Order Mark) and returns a QTextCodec instance
|
|
that is capable of decoding the text to unicode. This function can
|
|
detect one of the following codecs:
|
|
|
|
\list
|
|
\li UTF-32 Little Endian
|
|
\li UTF-32 Big Endian
|
|
\li UTF-16 Little Endian
|
|
\li UTF-16 Big Endian
|
|
\li UTF-8
|
|
\endlist
|
|
|
|
If the codec cannot be detected from the content provided, \a defaultCodec
|
|
is returned.
|
|
|
|
\sa codecForHtml()
|
|
*/
|
|
QTextCodec *QTextCodec::codecForUtfText(const QByteArray &ba, QTextCodec *defaultCodec)
{
    // Keep the full width of the container's size type so very large
    // arrays are not truncated into a bogus small or negative length.
    const qsizetype arraySize = ba.size();
    const uchar *buf = reinterpret_cast<const uchar *>(ba.constData());
    const uint bom = 0xfeff;

    // UTF-32 is tested before UTF-16: a UTF-32 LE BOM begins with the same
    // two bytes as a UTF-16 LE BOM.
    if (arraySize > 3) {
        const uint uc = qFromUnaligned<uint>(buf);
        if (uc == qToBigEndian(bom))
            return QTextCodec::codecForMib(1018); // utf-32 be
        if (uc == qToLittleEndian(bom))
            return QTextCodec::codecForMib(1019); // utf-32 le
    }

    if (arraySize < 2)
        return defaultCodec;

    const ushort uc = qFromUnaligned<ushort>(buf);
    if (uc == qToBigEndian(ushort(bom)))
        return QTextCodec::codecForMib(1013); // utf16 be
    if (uc == qToLittleEndian(ushort(bom)))
        return QTextCodec::codecForMib(1014); // utf16 le

    if (arraySize < 3)
        return defaultCodec;

    // sizeof - 1: compare the three BOM bytes, not the terminating NUL.
    static const char utf8bom[] = "\xef\xbb\xbf";
    if (memcmp(buf, utf8bom, sizeof(utf8bom) - 1) == 0)
        return QTextCodec::codecForMib(106); // utf-8

    return defaultCodec;
}
|
|
|
|
/*!
|
|
\overload
|
|
|
|
Tries to detect the encoding of the provided snippet \a ba by
|
|
using the BOM (Byte Order Mark) and returns a QTextCodec instance
|
|
that is capable of decoding the text to unicode. This function can
|
|
detect one of the following codecs:
|
|
|
|
\list
|
|
\li UTF-32 Little Endian
|
|
\li UTF-32 Big Endian
|
|
\li UTF-16 Little Endian
|
|
\li UTF-16 Big Endian
|
|
\li UTF-8
|
|
\endlist
|
|
|
|
If the codec cannot be detected from the content provided, this overload
|
|
returns a Latin-1 QTextCodec.
|
|
|
|
\sa codecForHtml()
|
|
*/
|
|
QTextCodec *QTextCodec::codecForUtfText(const QByteArray &ba)
{
    // Latin-1 (MIB 4) is the fallback when no BOM is recognised.
    QTextCodec *const latin1 = QTextCodec::codecForMib(/*Latin 1*/ 4);
    return codecForUtfText(ba, latin1);
}
|
|
|
|
/*!
|
|
\fn QTextCodec *QTextCodec::codecForTr ()
|
|
\deprecated
|
|
|
|
Returns the codec used by QObject::tr() on its argument. If this
|
|
function returns \nullptr (the default), tr() assumes Latin-1.
|
|
*/
|
|
|
|
/*!
|
|
\internal
|
|
\since 4.3
|
|
Determines whether the decoder encountered a failure while decoding the
|
|
input. If an error was encountered, the produced result is undefined, and
|
|
gets converted according to the conversion flags.
|
|
*/
|
|
bool QTextDecoder::hasFailure() const
|
|
{
|
|
return state.invalidChars != 0;
|
|
}
|
|
|
|
/*!
|
|
\internal
|
|
\since 5.12
|
|
|
|
Determines whether the decoder needs more bytes to continue decoding. That
|
|
is, this signifies that the input string ended in the middle of a
|
|
multi-byte sequence. Note that it's possible some codecs do not report this.
|
|
*/
|
|
bool QTextDecoder::needsMoreData() const
|
|
{
|
|
return state.remainingChars;
|
|
}
|
|
|
|
/*!
|
|
\fn QTextCodec * Qt::codecForHtml(const QByteArray &ba)
|
|
\internal
|
|
|
|
This function is defined in the \c <QTextCodec> header file.
|
|
*/
|
|
QTextCodec *Qt::codecForHtml(const QByteArray &ba)
{
    // Thin forwarder to the QTextCodec static of the same name.
    QTextCodec *const detected = QTextCodec::codecForHtml(ba);
    return detected;
}
|
|
|
|
QT_END_NAMESPACE
|