2022-05-10 10:06:48 +00:00
|
|
|
// Copyright (C) 2020 The Qt Company Ltd.
|
|
|
|
// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR GPL-3.0-only WITH Qt-GPL-exception-1.0
|
2011-04-27 10:05:43 +00:00
|
|
|
|
|
|
|
#include <qbytearray.h>
|
|
|
|
#include <qchar.h>
|
|
|
|
#include <qdebug.h>
|
2020-07-07 10:04:21 +00:00
|
|
|
#include <qfile.h>
|
|
|
|
#include <qhash.h>
|
|
|
|
#include <qlist.h>
|
2025-08-27 09:39:47 +00:00
|
|
|
#include <qspan.h>
|
2020-07-07 10:04:21 +00:00
|
|
|
#include <qstring.h>
|
2021-08-30 09:35:02 +00:00
|
|
|
#include <qbitarray.h>
|
2024-03-19 07:59:18 +00:00
|
|
|
#include <qvarlengtharray.h>
|
2021-08-19 14:29:43 +00:00
|
|
|
#include <private/qstringiterator_p.h>
|
2011-04-27 10:05:43 +00:00
|
|
|
#if 0
|
|
|
|
#include <private/qunicodetables_p.h>
|
|
|
|
#endif
|
|
|
|
|
2025-08-27 18:39:12 +00:00
|
|
|
#if QT_VERSION < QT_VERSION_CHECK(6, 9, 0)
|
|
|
|
// QSpan, QIODevice::readLineInto()
|
|
|
|
# error This tool needs Qt >= 6.9, even if you are building tables for Qt 6.5 or 6.8.
|
|
|
|
#endif
|
|
|
|
|
2025-01-17 12:03:50 +00:00
|
|
|
#define DATA_VERSION_S "16.0"
|
|
|
|
#define DATA_VERSION_STR "QChar::Unicode_16_0"
|
2011-04-27 10:05:43 +00:00
|
|
|
|
util/unicode: Extract Method readUnicodeFile()
There's about a dozen files this program reads, and in each of these
cases, the code to read the file line-by-line, remove comments (or
just LF) and trim the line before further handling is duplicated. It's
also very inefficient, we have better APIs these days (readLineInto(),
rvalue *this overloads, truncate() instead of = left(), ...). Besides,
as Mårten pointed out in review, trimmed() already removes the LF, so
we don't need to do it manually.
So Extract Method readUnicodeFile() that does that, coroutine-style
(but with function object for now), from all the readX() functions
(except readUnicodeData() itself, which is using nested readLine()s).
Also maintain a line number for later improving the error messages.
Remove some isEmpty() checks in the lambdas that, after the
refactoring, can never be true (because removing whitespace from a
trimmed() string cannot make the string empty, ditto with
simplified()).
The extracted function could even pre-split the line along `;`, but
for that, I would port each lambda to use QByteArrayView / qTokenizer
first.
Picking to all active branches, because a) this is a tool and b) we
continue to update the Unicode tables in all active branches, so the
tool to do so should not differ, unless the target branch requires it
(changed data structures, e.g.). Note that readLineInto() is not in
6.5, but the tool is not required to be built against the Qt version
it is building tables for, so we can use the latest Qt features here.
Pick-to: 6.10 6.9 6.8 6.5
Change-Id: I3b699f213c98baa45bc8bbdb7ae2ac985d893798
Reviewed-by: Mårten Nordheim <marten.nordheim@qt.io>
Reviewed-by: Ahmad Samir <a.samirh78@gmail.com>
2025-08-26 17:10:45 +00:00
|
|
|
using namespace Qt::StringLiterals;
|
2011-04-27 10:05:43 +00:00
|
|
|
|
|
|
|
static QHash<QByteArray, QChar::UnicodeVersion> age_map;
|
|
|
|
|
|
|
|
static void initAgeMap()
|
|
|
|
{
|
|
|
|
struct AgeMap {
|
|
|
|
const QChar::UnicodeVersion version;
|
|
|
|
const char *age;
|
|
|
|
} ageMap[] = {
|
|
|
|
{ QChar::Unicode_1_1, "1.1" },
|
|
|
|
{ QChar::Unicode_2_0, "2.0" },
|
|
|
|
{ QChar::Unicode_2_1_2, "2.1" },
|
|
|
|
{ QChar::Unicode_3_0, "3.0" },
|
|
|
|
{ QChar::Unicode_3_1, "3.1" },
|
|
|
|
{ QChar::Unicode_3_2, "3.2" },
|
|
|
|
{ QChar::Unicode_4_0, "4.0" },
|
|
|
|
{ QChar::Unicode_4_1, "4.1" },
|
|
|
|
{ QChar::Unicode_5_0, "5.0" },
|
2012-06-03 01:17:10 +00:00
|
|
|
{ QChar::Unicode_5_1, "5.1" },
|
|
|
|
{ QChar::Unicode_5_2, "5.2" },
|
|
|
|
{ QChar::Unicode_6_0, "6.0" },
|
|
|
|
{ QChar::Unicode_6_1, "6.1" },
|
2012-09-27 22:57:39 +00:00
|
|
|
{ QChar::Unicode_6_2, "6.2" },
|
2014-01-12 19:14:25 +00:00
|
|
|
{ QChar::Unicode_6_3, "6.3" },
|
Update Unicode data up to v7.0
* Two newly adopted currency symbols:
the Azerbaijan manat and the Russia ruble
* Pictographic symbols (including many emoji), geometric symbols,
arrows, and ornaments originating from the Wingdings and Webdings sets
* Twenty-three new lesser-used and historic scripts
extending support for written languages of North America, China, India,
other Asian countries, and Africa
* Letters used in Teuthonista and other transcriptional systems,
and a new notational set, Duployan
For more details, see http://www.unicode.org/versions/Unicode7.0.0/
The Properties struct's .*Diff members were narrowed down
to signed 15 bits and the unicodeVersion has been expanded to 8 bits.
[ChangeLog][QtCore] Unicode data updated to v.7.0
Change-Id: I93ab6f79fa3b05f61abc7279f1d046834c1c1a0b
Reviewed-by: Lars Knoll <lars.knoll@digia.com>
2015-03-23 21:01:06 +00:00
|
|
|
{ QChar::Unicode_7_0, "7.0" },
|
2015-11-02 04:28:14 +00:00
|
|
|
{ QChar::Unicode_8_0, "8.0" },
|
2017-12-12 08:47:53 +00:00
|
|
|
{ QChar::Unicode_9_0, "9.0" },
|
|
|
|
{ QChar::Unicode_10_0, "10.0" },
|
2019-10-23 15:17:49 +00:00
|
|
|
{ QChar::Unicode_11_0, "11.0" },
|
|
|
|
{ QChar::Unicode_12_0, "12.0" },
|
|
|
|
{ QChar::Unicode_12_1, "12.1" }, // UCD Revision 24
|
2020-03-13 16:26:53 +00:00
|
|
|
{ QChar::Unicode_13_0, "13.0" }, // UCD Revision 26
|
2021-10-04 13:06:52 +00:00
|
|
|
{ QChar::Unicode_14_0, "14.0" }, // UCD Revision 28
|
2022-10-04 11:57:44 +00:00
|
|
|
{ QChar::Unicode_15_0, "15.0" }, // UCD Revision 30
|
2024-01-25 11:18:48 +00:00
|
|
|
{ QChar::Unicode_15_1, "15.1" }, // UCD Revision 32
|
2025-01-17 12:03:50 +00:00
|
|
|
{ QChar::Unicode_16_0, "16.0" }, // UCD Revision 34
|
2011-04-27 10:05:43 +00:00
|
|
|
{ QChar::Unicode_Unassigned, 0 }
|
|
|
|
};
|
|
|
|
AgeMap *d = ageMap;
|
|
|
|
while (d->age) {
|
|
|
|
age_map.insert(d->age, d->version);
|
|
|
|
++d;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-05-06 11:44:58 +00:00
|
|
|
// Textual copy of the EastAsianWidth enum declared right below; the two must
// list the same enumerators in the same order.
// NOTE(review): presumably this text is written verbatim into the generated
// tables. The whitespace inside the literals is kept exactly as it appears in
// the text under review - confirm the indentation against the upstream file.
static const char *east_asian_width_string =
    "enum class EastAsianWidth : unsigned int {\n"
    "A,\n"
    "F,\n"
    "H,\n"
    "N,\n"
    "Na,\n"
    "W,\n"
    "};\n"
    "\n";
|
|
|
|
|
|
|
|
// East Asian width classes. The enumerator names are the same spellings that
// initEastAsianWidthMap() uses as lookup keys; values run from 0 (A) to 5 (W).
enum class EastAsianWidth : unsigned int {
    A,
    F,
    H,
    N,
    Na,
    W,
};
|
|
|
|
|
|
|
|
static QHash<QByteArray, EastAsianWidth> eastAsianWidthMap;
|
|
|
|
|
|
|
|
static void initEastAsianWidthMap()
|
|
|
|
{
|
|
|
|
constexpr struct W {
|
|
|
|
EastAsianWidth width;
|
|
|
|
const char *name;
|
|
|
|
} widths[] = {
|
|
|
|
{ EastAsianWidth::A, "A" },
|
|
|
|
{ EastAsianWidth::F, "F" },
|
|
|
|
{ EastAsianWidth::H, "H" },
|
|
|
|
{ EastAsianWidth::N, "N" },
|
|
|
|
{ EastAsianWidth::Na, "Na" },
|
|
|
|
{ EastAsianWidth::W, "W" },
|
|
|
|
};
|
|
|
|
|
|
|
|
for (auto &w : widths)
|
|
|
|
eastAsianWidthMap.insert(w.name, w.width);
|
|
|
|
}
|
|
|
|
|
2012-06-17 01:20:59 +00:00
|
|
|
static QHash<QByteArray, QChar::Category> categoryMap;
|
|
|
|
|
|
|
|
static void initCategoryMap()
|
|
|
|
{
|
|
|
|
struct Cat {
|
|
|
|
QChar::Category cat;
|
|
|
|
const char *name;
|
|
|
|
} categories[] = {
|
|
|
|
{ QChar::Mark_NonSpacing, "Mn" },
|
|
|
|
{ QChar::Mark_SpacingCombining, "Mc" },
|
|
|
|
{ QChar::Mark_Enclosing, "Me" },
|
|
|
|
|
|
|
|
{ QChar::Number_DecimalDigit, "Nd" },
|
|
|
|
{ QChar::Number_Letter, "Nl" },
|
|
|
|
{ QChar::Number_Other, "No" },
|
|
|
|
|
|
|
|
{ QChar::Separator_Space, "Zs" },
|
|
|
|
{ QChar::Separator_Line, "Zl" },
|
|
|
|
{ QChar::Separator_Paragraph, "Zp" },
|
|
|
|
|
|
|
|
{ QChar::Other_Control, "Cc" },
|
|
|
|
{ QChar::Other_Format, "Cf" },
|
|
|
|
{ QChar::Other_Surrogate, "Cs" },
|
|
|
|
{ QChar::Other_PrivateUse, "Co" },
|
|
|
|
{ QChar::Other_NotAssigned, "Cn" },
|
|
|
|
|
|
|
|
{ QChar::Letter_Uppercase, "Lu" },
|
|
|
|
{ QChar::Letter_Lowercase, "Ll" },
|
|
|
|
{ QChar::Letter_Titlecase, "Lt" },
|
|
|
|
{ QChar::Letter_Modifier, "Lm" },
|
|
|
|
{ QChar::Letter_Other, "Lo" },
|
|
|
|
|
|
|
|
{ QChar::Punctuation_Connector, "Pc" },
|
|
|
|
{ QChar::Punctuation_Dash, "Pd" },
|
|
|
|
{ QChar::Punctuation_Open, "Ps" },
|
|
|
|
{ QChar::Punctuation_Close, "Pe" },
|
|
|
|
{ QChar::Punctuation_InitialQuote, "Pi" },
|
|
|
|
{ QChar::Punctuation_FinalQuote, "Pf" },
|
|
|
|
{ QChar::Punctuation_Other, "Po" },
|
|
|
|
|
|
|
|
{ QChar::Symbol_Math, "Sm" },
|
|
|
|
{ QChar::Symbol_Currency, "Sc" },
|
|
|
|
{ QChar::Symbol_Modifier, "Sk" },
|
|
|
|
{ QChar::Symbol_Other, "So" },
|
|
|
|
{ QChar::Other_NotAssigned, 0 }
|
|
|
|
};
|
|
|
|
Cat *c = categories;
|
|
|
|
while (c->name) {
|
|
|
|
categoryMap.insert(c->name, c->cat);
|
|
|
|
++c;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
static QHash<QByteArray, QChar::Decomposition> decompositionMap;
|
|
|
|
|
|
|
|
static void initDecompositionMap()
|
|
|
|
{
|
|
|
|
struct Dec {
|
|
|
|
QChar::Decomposition dec;
|
|
|
|
const char *name;
|
|
|
|
} decompositions[] = {
|
|
|
|
{ QChar::Canonical, "<canonical>" },
|
|
|
|
{ QChar::Font, "<font>" },
|
|
|
|
{ QChar::NoBreak, "<noBreak>" },
|
|
|
|
{ QChar::Initial, "<initial>" },
|
|
|
|
{ QChar::Medial, "<medial>" },
|
|
|
|
{ QChar::Final, "<final>" },
|
|
|
|
{ QChar::Isolated, "<isolated>" },
|
|
|
|
{ QChar::Circle, "<circle>" },
|
|
|
|
{ QChar::Super, "<super>" },
|
|
|
|
{ QChar::Sub, "<sub>" },
|
|
|
|
{ QChar::Vertical, "<vertical>" },
|
|
|
|
{ QChar::Wide, "<wide>" },
|
|
|
|
{ QChar::Narrow, "<narrow>" },
|
|
|
|
{ QChar::Small, "<small>" },
|
|
|
|
{ QChar::Square, "<square>" },
|
|
|
|
{ QChar::Compat, "<compat>" },
|
|
|
|
{ QChar::Fraction, "<fraction>" },
|
|
|
|
{ QChar::NoDecomposition, 0 }
|
|
|
|
};
|
|
|
|
Dec *d = decompositions;
|
|
|
|
while (d->name) {
|
|
|
|
decompositionMap.insert(d->name, d->dec);
|
|
|
|
++d;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2014-01-12 19:14:25 +00:00
|
|
|
enum Direction {
|
|
|
|
DirL = QChar::DirL,
|
|
|
|
DirR = QChar::DirR,
|
|
|
|
DirEN = QChar::DirEN,
|
|
|
|
DirES = QChar::DirES,
|
|
|
|
DirET = QChar::DirET,
|
|
|
|
DirAN = QChar::DirAN,
|
|
|
|
DirCS = QChar::DirCS,
|
|
|
|
DirB = QChar::DirB,
|
|
|
|
DirS = QChar::DirS,
|
|
|
|
DirWS = QChar::DirWS,
|
|
|
|
DirON = QChar::DirON,
|
|
|
|
DirLRE = QChar::DirLRE,
|
|
|
|
DirLRO = QChar::DirLRO,
|
|
|
|
DirAL = QChar::DirAL,
|
|
|
|
DirRLE = QChar::DirRLE,
|
|
|
|
DirRLO = QChar::DirRLO,
|
|
|
|
DirPDF = QChar::DirPDF,
|
|
|
|
DirNSM = QChar::DirNSM,
|
|
|
|
DirBN = QChar::DirBN,
|
|
|
|
DirLRI = QChar::DirLRI,
|
|
|
|
DirRLI = QChar::DirRLI,
|
|
|
|
DirFSI = QChar::DirFSI,
|
2019-11-27 13:40:33 +00:00
|
|
|
DirPDI = QChar::DirPDI,
|
2014-01-12 19:14:25 +00:00
|
|
|
|
2019-11-27 13:40:33 +00:00
|
|
|
Dir_Unassigned
|
2014-01-12 19:14:25 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
static QHash<QByteArray, Direction> directionMap;
|
2012-06-17 01:20:59 +00:00
|
|
|
|
|
|
|
static void initDirectionMap()
|
|
|
|
{
|
|
|
|
struct Dir {
|
2014-01-12 19:14:25 +00:00
|
|
|
Direction dir;
|
2012-06-17 01:20:59 +00:00
|
|
|
const char *name;
|
|
|
|
} directions[] = {
|
2014-01-12 19:14:25 +00:00
|
|
|
{ DirL, "L" },
|
|
|
|
{ DirR, "R" },
|
|
|
|
{ DirEN, "EN" },
|
|
|
|
{ DirES, "ES" },
|
|
|
|
{ DirET, "ET" },
|
|
|
|
{ DirAN, "AN" },
|
|
|
|
{ DirCS, "CS" },
|
|
|
|
{ DirB, "B" },
|
|
|
|
{ DirS, "S" },
|
|
|
|
{ DirWS, "WS" },
|
|
|
|
{ DirON, "ON" },
|
|
|
|
{ DirLRE, "LRE" },
|
|
|
|
{ DirLRO, "LRO" },
|
|
|
|
{ DirAL, "AL" },
|
|
|
|
{ DirRLE, "RLE" },
|
|
|
|
{ DirRLO, "RLO" },
|
|
|
|
{ DirPDF, "PDF" },
|
|
|
|
{ DirNSM, "NSM" },
|
|
|
|
{ DirBN, "BN" },
|
|
|
|
{ DirLRI, "LRI" },
|
|
|
|
{ DirRLI, "RLI" },
|
|
|
|
{ DirFSI, "FSI" },
|
|
|
|
{ DirPDI, "PDI" },
|
|
|
|
{ Dir_Unassigned, 0 }
|
2012-06-17 01:20:59 +00:00
|
|
|
};
|
|
|
|
Dir *d = directions;
|
|
|
|
while (d->name) {
|
|
|
|
directionMap.insert(d->name, d->dir);
|
|
|
|
++d;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2014-01-26 00:42:37 +00:00
|
|
|
// Joining types, numbered consecutively from 0. Joining_Unassigned comes last
// and is the only enumerator that initJoiningMap() gives no name.
enum JoiningType {
    Joining_None,        // 0
    Joining_Causing,     // 1
    Joining_Dual,        // 2
    Joining_Right,       // 3
    Joining_Left,        // 4
    Joining_Transparent, // 5

    Joining_Unassigned   // 6
};
|
|
|
|
|
2014-01-26 00:42:37 +00:00
|
|
|
static QHash<QByteArray, JoiningType> joining_map;
|
2011-04-27 10:05:43 +00:00
|
|
|
|
|
|
|
static void initJoiningMap()
|
|
|
|
{
|
|
|
|
struct JoiningList {
|
2014-01-26 00:42:37 +00:00
|
|
|
JoiningType joining;
|
2011-04-27 10:05:43 +00:00
|
|
|
const char *name;
|
|
|
|
} joinings[] = {
|
|
|
|
{ Joining_None, "U" },
|
|
|
|
{ Joining_Causing, "C" },
|
|
|
|
{ Joining_Dual, "D" },
|
|
|
|
{ Joining_Right, "R" },
|
2014-01-26 00:42:37 +00:00
|
|
|
{ Joining_Left, "L" },
|
2011-04-27 10:05:43 +00:00
|
|
|
{ Joining_Transparent, "T" },
|
|
|
|
{ Joining_Unassigned, 0 }
|
|
|
|
};
|
|
|
|
JoiningList *d = joinings;
|
|
|
|
while (d->name) {
|
|
|
|
joining_map.insert(d->name, d->joining);
|
|
|
|
++d;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2012-06-17 01:55:07 +00:00
|
|
|
// Textual declaration of GraphemeBreakClass. It lists the same enumerators, in
// the same order, as the GraphemeBreakClass enum used by this tool, except that
// the final slot is spelled NumGraphemeBreakClasses here.
// NOTE(review): presumably this text is written verbatim into the generated
// tables. The whitespace inside the literals is kept exactly as it appears in
// the text under review - confirm the indentation against the upstream file.
static const char *grapheme_break_class_string =
    "enum GraphemeBreakClass {\n"
    " GraphemeBreak_Any,\n"
    " GraphemeBreak_CR,\n"
    " GraphemeBreak_LF,\n"
    " GraphemeBreak_Control,\n"
    " GraphemeBreak_Extend,\n"
    " GraphemeBreak_ZWJ,\n"
    " GraphemeBreak_RegionalIndicator,\n"
    " GraphemeBreak_Prepend,\n"
    " GraphemeBreak_SpacingMark,\n"
    " GraphemeBreak_L,\n"
    " GraphemeBreak_V,\n"
    " GraphemeBreak_T,\n"
    " GraphemeBreak_LV,\n"
    " GraphemeBreak_LVT,\n"
    " GraphemeBreak_Extended_Pictographic,\n"
    "\n"
    " NumGraphemeBreakClasses\n"
    "};\n\n";
|
2011-04-27 10:05:43 +00:00
|
|
|
|
2012-06-17 01:55:07 +00:00
|
|
|
// Grapheme cluster break classes, numbered consecutively from 0. The order
// matches grapheme_break_class_string; GraphemeBreak_Unassigned occupies the
// slot that the string calls NumGraphemeBreakClasses.
enum GraphemeBreakClass {
    GraphemeBreak_Any,                   // 0
    GraphemeBreak_CR,                    // 1
    GraphemeBreak_LF,                    // 2
    GraphemeBreak_Control,               // 3
    GraphemeBreak_Extend,                // 4
    GraphemeBreak_ZWJ,                   // 5
    GraphemeBreak_RegionalIndicator,     // 6
    GraphemeBreak_Prepend,               // 7
    GraphemeBreak_SpacingMark,           // 8
    GraphemeBreak_L,                     // 9
    GraphemeBreak_V,                     // 10
    GraphemeBreak_T,                     // 11
    GraphemeBreak_LV,                    // 12
    GraphemeBreak_LVT,                   // 13
    GraphemeBreak_Extended_Pictographic, // 14

    GraphemeBreak_Unassigned             // 15
};
|
|
|
|
|
2012-06-17 01:55:07 +00:00
|
|
|
static QHash<QByteArray, GraphemeBreakClass> grapheme_break_map;
|
2011-04-27 10:05:43 +00:00
|
|
|
|
|
|
|
static void initGraphemeBreak()
|
|
|
|
{
|
|
|
|
struct GraphemeBreakList {
|
2012-06-17 01:55:07 +00:00
|
|
|
GraphemeBreakClass brk;
|
2011-04-27 10:05:43 +00:00
|
|
|
const char *name;
|
|
|
|
} breaks[] = {
|
2017-12-12 09:14:28 +00:00
|
|
|
{ GraphemeBreak_Any, "Any" },
|
2012-06-17 01:55:07 +00:00
|
|
|
{ GraphemeBreak_CR, "CR" },
|
|
|
|
{ GraphemeBreak_LF, "LF" },
|
|
|
|
{ GraphemeBreak_Control, "Control" },
|
|
|
|
{ GraphemeBreak_Extend, "Extend" },
|
2017-12-12 09:14:28 +00:00
|
|
|
{ GraphemeBreak_ZWJ, "ZWJ" },
|
2012-09-27 22:57:39 +00:00
|
|
|
{ GraphemeBreak_RegionalIndicator, "Regional_Indicator" },
|
2012-06-17 01:55:07 +00:00
|
|
|
{ GraphemeBreak_Prepend, "Prepend" },
|
|
|
|
{ GraphemeBreak_SpacingMark, "SpacingMark" },
|
|
|
|
{ GraphemeBreak_L, "L" },
|
|
|
|
{ GraphemeBreak_V, "V" },
|
|
|
|
{ GraphemeBreak_T, "T" },
|
|
|
|
{ GraphemeBreak_LV, "LV" },
|
|
|
|
{ GraphemeBreak_LVT, "LVT" },
|
2021-04-15 12:39:51 +00:00
|
|
|
{ GraphemeBreak_Extended_Pictographic, "Extended_Pictographic" },
|
|
|
|
{ GraphemeBreak_Unassigned, nullptr }
|
2011-04-27 10:05:43 +00:00
|
|
|
};
|
|
|
|
GraphemeBreakList *d = breaks;
|
|
|
|
while (d->name) {
|
|
|
|
grapheme_break_map.insert(d->name, d->brk);
|
|
|
|
++d;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2012-06-17 01:55:07 +00:00
|
|
|
// Textual declaration of WordBreakClass. It lists the same enumerators, in the
// same order, as the WordBreakClass enum used by this tool, except that the
// final slot is spelled NumWordBreakClasses here.
// NOTE(review): presumably this text is written verbatim into the generated
// tables. The whitespace inside the literals is kept exactly as it appears in
// the text under review - confirm the indentation against the upstream file.
static const char *word_break_class_string =
    "enum WordBreakClass {\n"
    " WordBreak_Any,\n"
    " WordBreak_CR,\n"
    " WordBreak_LF,\n"
    " WordBreak_Newline,\n"
    " WordBreak_Extend,\n"
    " WordBreak_ZWJ,\n"
    " WordBreak_Format,\n"
    " WordBreak_RegionalIndicator,\n"
    " WordBreak_Katakana,\n"
    " WordBreak_HebrewLetter,\n"
    " WordBreak_ALetter,\n"
    " WordBreak_SingleQuote,\n"
    " WordBreak_DoubleQuote,\n"
    " WordBreak_MidNumLet,\n"
    " WordBreak_MidLetter,\n"
    " WordBreak_MidNum,\n"
    " WordBreak_Numeric,\n"
    " WordBreak_ExtendNumLet,\n"
    " WordBreak_WSegSpace,\n"
    "\n"
    " NumWordBreakClasses\n"
    "};\n\n";
|
2011-04-27 10:05:43 +00:00
|
|
|
|
2012-06-17 01:55:07 +00:00
|
|
|
// Word break classes, numbered consecutively from 0. The order matches
// word_break_class_string; WordBreak_Unassigned occupies the slot that the
// string calls NumWordBreakClasses.
enum WordBreakClass {
    WordBreak_Any,               // 0
    WordBreak_CR,                // 1
    WordBreak_LF,                // 2
    WordBreak_Newline,           // 3
    WordBreak_Extend,            // 4
    WordBreak_ZWJ,               // 5
    WordBreak_Format,            // 6
    WordBreak_RegionalIndicator, // 7
    WordBreak_Katakana,          // 8
    WordBreak_HebrewLetter,      // 9
    WordBreak_ALetter,           // 10
    WordBreak_SingleQuote,       // 11
    WordBreak_DoubleQuote,       // 12
    WordBreak_MidNumLet,         // 13
    WordBreak_MidLetter,         // 14
    WordBreak_MidNum,            // 15
    WordBreak_Numeric,           // 16
    WordBreak_ExtendNumLet,      // 17
    WordBreak_WSegSpace,         // 18

    WordBreak_Unassigned         // 19
};
|
|
|
|
|
2012-06-17 01:55:07 +00:00
|
|
|
static QHash<QByteArray, WordBreakClass> word_break_map;
|
2011-04-27 10:05:43 +00:00
|
|
|
|
|
|
|
static void initWordBreak()
|
|
|
|
{
|
|
|
|
struct WordBreakList {
|
2012-06-17 01:55:07 +00:00
|
|
|
WordBreakClass brk;
|
2011-04-27 10:05:43 +00:00
|
|
|
const char *name;
|
|
|
|
} breaks[] = {
|
2017-12-12 09:14:28 +00:00
|
|
|
{ WordBreak_Any, "Any" },
|
2012-06-17 01:55:07 +00:00
|
|
|
{ WordBreak_CR, "CR" },
|
|
|
|
{ WordBreak_LF, "LF" },
|
|
|
|
{ WordBreak_Newline, "Newline" },
|
|
|
|
{ WordBreak_Extend, "Extend" },
|
2017-12-12 09:14:28 +00:00
|
|
|
{ WordBreak_ZWJ, "ZWJ" },
|
|
|
|
{ WordBreak_Format, "Format" },
|
2012-09-27 22:57:39 +00:00
|
|
|
{ WordBreak_RegionalIndicator, "Regional_Indicator" },
|
2012-06-17 01:55:07 +00:00
|
|
|
{ WordBreak_Katakana, "Katakana" },
|
2014-01-12 19:14:25 +00:00
|
|
|
{ WordBreak_HebrewLetter, "Hebrew_Letter" },
|
2012-06-17 01:55:07 +00:00
|
|
|
{ WordBreak_ALetter, "ALetter" },
|
2014-01-12 19:14:25 +00:00
|
|
|
{ WordBreak_SingleQuote, "Single_Quote" },
|
|
|
|
{ WordBreak_DoubleQuote, "Double_Quote" },
|
2012-06-17 01:55:07 +00:00
|
|
|
{ WordBreak_MidNumLet, "MidNumLet" },
|
|
|
|
{ WordBreak_MidLetter, "MidLetter" },
|
|
|
|
{ WordBreak_MidNum, "MidNum" },
|
|
|
|
{ WordBreak_Numeric, "Numeric" },
|
|
|
|
{ WordBreak_ExtendNumLet, "ExtendNumLet" },
|
2019-10-23 15:17:49 +00:00
|
|
|
{ WordBreak_WSegSpace, "WSegSpace" },
|
2011-04-27 10:05:43 +00:00
|
|
|
{ WordBreak_Unassigned, 0 }
|
|
|
|
};
|
|
|
|
WordBreakList *d = breaks;
|
|
|
|
while (d->name) {
|
|
|
|
word_break_map.insert(d->name, d->brk);
|
|
|
|
++d;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2012-06-17 01:55:07 +00:00
|
|
|
// Textual declaration of SentenceBreakClass. It lists the same enumerators, in
// the same order, as the SentenceBreakClass enum used by this tool, except that
// the final slot is spelled NumSentenceBreakClasses here.
// NOTE(review): presumably this text is written verbatim into the generated
// tables. The whitespace inside the literals is kept exactly as it appears in
// the text under review - confirm the indentation against the upstream file.
static const char *sentence_break_class_string =
    "enum SentenceBreakClass {\n"
    " SentenceBreak_Any,\n"
    " SentenceBreak_CR,\n"
    " SentenceBreak_LF,\n"
    " SentenceBreak_Sep,\n"
    " SentenceBreak_Extend,\n"
    " SentenceBreak_Sp,\n"
    " SentenceBreak_Lower,\n"
    " SentenceBreak_Upper,\n"
    " SentenceBreak_OLetter,\n"
    " SentenceBreak_Numeric,\n"
    " SentenceBreak_ATerm,\n"
    " SentenceBreak_SContinue,\n"
    " SentenceBreak_STerm,\n"
    " SentenceBreak_Close,\n"
    "\n"
    " NumSentenceBreakClasses\n"
    "};\n\n";
|
2011-04-27 10:05:43 +00:00
|
|
|
|
2012-06-17 01:55:07 +00:00
|
|
|
// Sentence break classes, numbered consecutively from 0. The order matches
// sentence_break_class_string; SentenceBreak_Unassigned occupies the slot that
// the string calls NumSentenceBreakClasses.
enum SentenceBreakClass {
    SentenceBreak_Any,       // 0
    SentenceBreak_CR,        // 1
    SentenceBreak_LF,        // 2
    SentenceBreak_Sep,       // 3
    SentenceBreak_Extend,    // 4
    SentenceBreak_Sp,        // 5
    SentenceBreak_Lower,     // 6
    SentenceBreak_Upper,     // 7
    SentenceBreak_OLetter,   // 8
    SentenceBreak_Numeric,   // 9
    SentenceBreak_ATerm,     // 10
    SentenceBreak_SContinue, // 11
    SentenceBreak_STerm,     // 12
    SentenceBreak_Close,     // 13

    SentenceBreak_Unassigned // 14
};
|
|
|
|
|
2012-06-17 01:55:07 +00:00
|
|
|
static QHash<QByteArray, SentenceBreakClass> sentence_break_map;
|
2011-04-27 10:05:43 +00:00
|
|
|
|
|
|
|
static void initSentenceBreak()
|
|
|
|
{
|
|
|
|
struct SentenceBreakList {
|
2012-06-17 01:55:07 +00:00
|
|
|
SentenceBreakClass brk;
|
2011-04-27 10:05:43 +00:00
|
|
|
const char *name;
|
|
|
|
} breaks[] = {
|
2017-12-12 09:14:28 +00:00
|
|
|
{ SentenceBreak_Any, "Any" },
|
2012-06-17 01:55:07 +00:00
|
|
|
{ SentenceBreak_CR, "CR" },
|
|
|
|
{ SentenceBreak_LF, "LF" },
|
|
|
|
{ SentenceBreak_Sep, "Sep" },
|
|
|
|
{ SentenceBreak_Extend, "Extend" },
|
|
|
|
{ SentenceBreak_Extend, "Format" },
|
|
|
|
{ SentenceBreak_Sp, "Sp" },
|
|
|
|
{ SentenceBreak_Lower, "Lower" },
|
|
|
|
{ SentenceBreak_Upper, "Upper" },
|
|
|
|
{ SentenceBreak_OLetter, "OLetter" },
|
|
|
|
{ SentenceBreak_Numeric, "Numeric" },
|
|
|
|
{ SentenceBreak_ATerm, "ATerm" },
|
|
|
|
{ SentenceBreak_SContinue, "SContinue" },
|
|
|
|
{ SentenceBreak_STerm, "STerm" },
|
|
|
|
{ SentenceBreak_Close, "Close" },
|
2011-04-27 10:05:43 +00:00
|
|
|
{ SentenceBreak_Unassigned, 0 }
|
|
|
|
};
|
|
|
|
SentenceBreakList *d = breaks;
|
|
|
|
while (d->name) {
|
|
|
|
sentence_break_map.insert(d->name, d->brk);
|
|
|
|
++d;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2012-04-23 03:00:16 +00:00
|
|
|
// Textual declaration of LineBreakClass, preceded by three comment lines that
// are part of the text itself. It lists the same enumerators, in the same
// order, as the LineBreakClass enum used by this tool, except that the final
// slot is spelled NumLineBreakClasses here.
// NOTE(review): presumably this text is written verbatim into the generated
// tables. The whitespace inside the literals is kept exactly as it appears in
// the text under review - confirm the indentation against the upstream file.
// NOTE(review): the embedded sentence "VI and VF classes are mapped to CM"
// looks out of date now that LineBreak_VI and LineBreak_VF are listed as
// classes of their own; left untouched here because it is emitted text.
static const char *line_break_class_string =
    // leading comment lines of the emitted text
    "// see http://www.unicode.org/reports/tr14/tr14-30.html\n"
    "// we don't use the XX and AI classes but map them to AL instead.\n"
    "// VI and VF classes are mapped to CM.\n"
    // the enum itself
    "enum LineBreakClass {\n"
    " LineBreak_OP, LineBreak_CL, LineBreak_CP,\n"
    " LineBreak_QU, LineBreak_QU_Pi, LineBreak_QU_Pf, LineBreak_QU_19,\n"
    " LineBreak_GL, LineBreak_NS, LineBreak_EX, LineBreak_SY,\n"
    " LineBreak_IS, LineBreak_PR,\n"
    " LineBreak_PO, LineBreak_NU, LineBreak_AL, LineBreak_HL, LineBreak_ID,\n"
    " LineBreak_IN, LineBreak_HY, LineBreak_WS_HY,\n"
    " LineBreak_BA, LineBreak_WS_BA,\n"
    " LineBreak_HYBA,\n"
    " LineBreak_BB, LineBreak_B2,\n"
    " LineBreak_ZW, LineBreak_CM, LineBreak_WJ, LineBreak_H2, LineBreak_H3,\n"
    " LineBreak_JL, LineBreak_JV, LineBreak_JT, LineBreak_RI, LineBreak_CB,\n"
    " LineBreak_EB, LineBreak_EM,\n"
    "\n"
    " LineBreak_AK, LineBreak_AP, LineBreak_AS,\n"
    " LineBreak_VI, LineBreak_VF,\n"
    "\n"
    " LineBreak_ZWJ,\n"
    " LineBreak_SA, LineBreak_SG, LineBreak_SP,\n"
    " LineBreak_CR, LineBreak_LF, LineBreak_BK,\n"
    "\n"
    " NumLineBreakClasses\n"
    "};\n\n";
|
2011-04-27 10:05:43 +00:00
|
|
|
|
|
|
|
// Line break classes, numbered consecutively from 0. The order matches
// line_break_class_string; LineBreak_Unassigned occupies the slot that the
// string calls NumLineBreakClasses.
enum LineBreakClass {
    LineBreak_OP,    // 0
    LineBreak_CL,
    LineBreak_CP,
    LineBreak_QU,
    LineBreak_QU_Pi,
    LineBreak_QU_Pf,
    LineBreak_QU_19,
    LineBreak_GL,
    LineBreak_NS,
    LineBreak_EX,
    LineBreak_SY,    // 10
    LineBreak_IS,
    LineBreak_PR,
    LineBreak_PO,
    LineBreak_NU,
    LineBreak_AL,
    LineBreak_HL,
    LineBreak_ID,
    LineBreak_IN,
    LineBreak_HY,
    LineBreak_WS_HY, // 20
    LineBreak_BA,
    LineBreak_WS_BA,
    LineBreak_HYBA,
    LineBreak_BB,
    LineBreak_B2,
    LineBreak_ZW,
    LineBreak_CM,
    LineBreak_WJ,
    LineBreak_H2,
    LineBreak_H3,    // 30
    LineBreak_JL,
    LineBreak_JV,
    LineBreak_JT,
    LineBreak_RI,
    LineBreak_CB,
    LineBreak_EB,
    LineBreak_EM,

    LineBreak_AK,
    LineBreak_AP,
    LineBreak_AS,    // 40
    LineBreak_VI,
    LineBreak_VF,

    LineBreak_ZWJ,
    LineBreak_SA,
    LineBreak_SG,
    LineBreak_SP,
    LineBreak_CR,
    LineBreak_LF,
    LineBreak_BK,

    LineBreak_Unassigned // 50
};
|
|
|
|
|
|
|
|
static QHash<QByteArray, LineBreakClass> line_break_map;
|
|
|
|
|
|
|
|
static void initLineBreak()
|
|
|
|
{
|
2012-06-03 01:17:10 +00:00
|
|
|
// ### Classes XX and AI are left out and mapped to AL for now.
|
|
|
|
// ### Class NL is mapped to BK.
|
|
|
|
// ### Treating characters of class CJ as class NS will give CSS strict line breaking;
|
|
|
|
// treating them as class ID will give CSS normal breaking.
|
2011-04-27 10:05:43 +00:00
|
|
|
struct LineBreakList {
|
|
|
|
LineBreakClass brk;
|
|
|
|
const char *name;
|
|
|
|
} breaks[] = {
|
|
|
|
{ LineBreak_BK, "BK" },
|
|
|
|
{ LineBreak_CR, "CR" },
|
|
|
|
{ LineBreak_LF, "LF" },
|
|
|
|
{ LineBreak_CM, "CM" },
|
2012-06-03 01:17:10 +00:00
|
|
|
{ LineBreak_BK, "NL" },
|
2011-04-27 10:05:43 +00:00
|
|
|
{ LineBreak_SG, "SG" },
|
|
|
|
{ LineBreak_WJ, "WJ" },
|
|
|
|
{ LineBreak_ZW, "ZW" },
|
|
|
|
{ LineBreak_GL, "GL" },
|
|
|
|
{ LineBreak_SP, "SP" },
|
|
|
|
{ LineBreak_B2, "B2" },
|
|
|
|
{ LineBreak_BA, "BA" },
|
|
|
|
{ LineBreak_BB, "BB" },
|
|
|
|
{ LineBreak_HY, "HY" },
|
2012-05-31 10:04:32 +00:00
|
|
|
{ LineBreak_CB, "CB" },
|
2012-06-03 01:17:10 +00:00
|
|
|
{ LineBreak_NS, "CJ" },
|
2011-04-27 10:05:43 +00:00
|
|
|
{ LineBreak_CL, "CL" },
|
2012-06-03 01:17:10 +00:00
|
|
|
{ LineBreak_CP, "CP" },
|
2011-04-27 10:05:43 +00:00
|
|
|
{ LineBreak_EX, "EX" },
|
|
|
|
{ LineBreak_IN, "IN" },
|
|
|
|
{ LineBreak_NS, "NS" },
|
|
|
|
{ LineBreak_OP, "OP" },
|
|
|
|
{ LineBreak_QU, "QU" },
|
|
|
|
{ LineBreak_IS, "IS" },
|
|
|
|
{ LineBreak_NU, "NU" },
|
|
|
|
{ LineBreak_PO, "PO" },
|
|
|
|
{ LineBreak_PR, "PR" },
|
|
|
|
{ LineBreak_SY, "SY" },
|
|
|
|
{ LineBreak_AL, "AI" },
|
|
|
|
{ LineBreak_AL, "AL" },
|
2012-06-03 01:17:10 +00:00
|
|
|
{ LineBreak_HL, "HL" },
|
2011-04-27 10:05:43 +00:00
|
|
|
{ LineBreak_H2, "H2" },
|
|
|
|
{ LineBreak_H3, "H3" },
|
|
|
|
{ LineBreak_ID, "ID" },
|
|
|
|
{ LineBreak_JL, "JL" },
|
|
|
|
{ LineBreak_JV, "JV" },
|
|
|
|
{ LineBreak_JT, "JT" },
|
2012-09-27 22:57:39 +00:00
|
|
|
{ LineBreak_RI, "RI" },
|
2011-04-27 10:05:43 +00:00
|
|
|
{ LineBreak_SA, "SA" },
|
|
|
|
{ LineBreak_AL, "XX" },
|
2017-12-12 09:14:28 +00:00
|
|
|
{ LineBreak_EB, "EB" },
|
|
|
|
{ LineBreak_EM, "EM" },
|
|
|
|
{ LineBreak_ZWJ, "ZWJ" },
|
2025-01-17 12:03:50 +00:00
|
|
|
{ LineBreak_AK, "AK" },
|
|
|
|
{ LineBreak_AP, "AP" },
|
|
|
|
{ LineBreak_AS, "AS" },
|
|
|
|
{ LineBreak_VI, "VI" },
|
|
|
|
{ LineBreak_VF, "VF" },
|
2011-04-27 10:05:43 +00:00
|
|
|
{ LineBreak_Unassigned, 0 }
|
|
|
|
};
|
|
|
|
LineBreakList *d = breaks;
|
|
|
|
while (d->name) {
|
|
|
|
line_break_map.insert(d->name, d->brk);
|
|
|
|
++d;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2012-12-08 03:36:49 +00:00
|
|
|
static QHash<QByteArray, QChar::Script> scriptMap;
|
|
|
|
|
|
|
|
static void initScriptMap()
|
|
|
|
{
|
|
|
|
struct Scrpt {
|
|
|
|
QChar::Script script;
|
|
|
|
const char *name;
|
|
|
|
} scripts[] = {
|
|
|
|
// general
|
|
|
|
{ QChar::Script_Unknown, "Unknown" },
|
|
|
|
{ QChar::Script_Inherited, "Inherited" },
|
|
|
|
{ QChar::Script_Common, "Common" },
|
|
|
|
// pre-4.0
|
|
|
|
{ QChar::Script_Latin, "Latin" },
|
|
|
|
{ QChar::Script_Greek, "Greek" },
|
|
|
|
{ QChar::Script_Cyrillic, "Cyrillic" },
|
|
|
|
{ QChar::Script_Armenian, "Armenian" },
|
|
|
|
{ QChar::Script_Hebrew, "Hebrew" },
|
|
|
|
{ QChar::Script_Arabic, "Arabic" },
|
|
|
|
{ QChar::Script_Syriac, "Syriac" },
|
|
|
|
{ QChar::Script_Thaana, "Thaana" },
|
|
|
|
{ QChar::Script_Devanagari, "Devanagari" },
|
|
|
|
{ QChar::Script_Bengali, "Bengali" },
|
|
|
|
{ QChar::Script_Gurmukhi, "Gurmukhi" },
|
|
|
|
{ QChar::Script_Gujarati, "Gujarati" },
|
|
|
|
{ QChar::Script_Oriya, "Oriya" },
|
|
|
|
{ QChar::Script_Tamil, "Tamil" },
|
|
|
|
{ QChar::Script_Telugu, "Telugu" },
|
|
|
|
{ QChar::Script_Kannada, "Kannada" },
|
|
|
|
{ QChar::Script_Malayalam, "Malayalam" },
|
|
|
|
{ QChar::Script_Sinhala, "Sinhala" },
|
|
|
|
{ QChar::Script_Thai, "Thai" },
|
|
|
|
{ QChar::Script_Lao, "Lao" },
|
|
|
|
{ QChar::Script_Tibetan, "Tibetan" },
|
|
|
|
{ QChar::Script_Myanmar, "Myanmar" },
|
|
|
|
{ QChar::Script_Georgian, "Georgian" },
|
|
|
|
{ QChar::Script_Hangul, "Hangul" },
|
|
|
|
{ QChar::Script_Ethiopic, "Ethiopic" },
|
|
|
|
{ QChar::Script_Cherokee, "Cherokee" },
|
util/unicode: remove replace('_', "") from readScripts()
For some reason, the code stored the official Unicode script tags
without their intervening underscores, removing underscores from the
input before attempting to match, which works, as long as Unicode
stays consistent in spelling properties "Like_This".
Relying on that is brittle, though, seeing as a tag without intervening
underscore (SignWriting) already slipped into the database, potentially
matching a sought Sign_Writing. It's highly unlikely that Unicode will
start to use property names that differ only by their use of underscore,
but why risk it, and why confuse readers of code by using a different
sought string, compared to what's in the files?
Fix by storing the tags unaltered and leaving the underscores in the
input alone, too.
Amends the start of the public history.
Pick-to: 6.10 6.9 6.8 6.5
Change-Id: I5870a35812cb3fc0b28888cb09e9f42661684a26
Reviewed-by: Mårten Nordheim <marten.nordheim@qt.io>
2025-08-27 16:10:37 +00:00
|
|
|
{ QChar::Script_CanadianAboriginal, "Canadian_Aboriginal" },
|
2012-12-08 03:36:49 +00:00
|
|
|
{ QChar::Script_Ogham, "Ogham" },
|
|
|
|
{ QChar::Script_Runic, "Runic" },
|
|
|
|
{ QChar::Script_Khmer, "Khmer" },
|
|
|
|
{ QChar::Script_Mongolian, "Mongolian" },
|
|
|
|
{ QChar::Script_Hiragana, "Hiragana" },
|
|
|
|
{ QChar::Script_Katakana, "Katakana" },
|
|
|
|
{ QChar::Script_Bopomofo, "Bopomofo" },
|
|
|
|
{ QChar::Script_Han, "Han" },
|
|
|
|
{ QChar::Script_Yi, "Yi" },
|
util/unicode: remove replace('_', "") from readScripts()
For some reason, the code stored the official Unicode script tags
without their intervening underscores, removing underscores from the
input before attempting to match, which works, as long as Unicode
stays consistent in spelling properties "Like_This".
Relying on that is brittle, though, seeing as a tag without intervening
underscore (SignWriting) already slipped into the database, potentially
matching a sought Sign_Writing. It's highly unlikely that Unicode will
start to use property names that differ only by their use of underscore,
but why risk it, and why confuse readers of code by using a different
sought string, compared to what's in the files?
Fix by storing the tags unaltered and leaving the underscores in the
input alone, too.
Amends the start of the public history.
Pick-to: 6.10 6.9 6.8 6.5
Change-Id: I5870a35812cb3fc0b28888cb09e9f42661684a26
Reviewed-by: Mårten Nordheim <marten.nordheim@qt.io>
2025-08-27 16:10:37 +00:00
|
|
|
{ QChar::Script_OldItalic, "Old_Italic" },
|
2012-12-08 03:36:49 +00:00
|
|
|
{ QChar::Script_Gothic, "Gothic" },
|
|
|
|
{ QChar::Script_Deseret, "Deseret" },
|
|
|
|
{ QChar::Script_Tagalog, "Tagalog" },
|
|
|
|
{ QChar::Script_Hanunoo, "Hanunoo" },
|
|
|
|
{ QChar::Script_Buhid, "Buhid" },
|
|
|
|
{ QChar::Script_Tagbanwa, "Tagbanwa" },
|
|
|
|
{ QChar::Script_Coptic, "Coptic" },
|
|
|
|
// 4.0
|
|
|
|
{ QChar::Script_Limbu, "Limbu" },
|
util/unicode: remove replace('_', "") from readScripts()
For some reason, the code stored the official Unicode script tags
without their intervening underscores, removing underscores from the
input before attempting to match, which works, as long as Unicode
stays consistent in spelling properties "Like_This".
Relying on that is brittle, though, seeing as a tag without intervening
underscore (SignWriting) already slipped into the database, potentially
matching a sought Sign_Writing. It's highly unlikely that Unicode will
start to use property names that differ only by their use of underscore,
but why risk it, and why confuse readers of code by using a different
sought string, compared to what's in the files?
Fix by storing the tags unaltered and leaving the underscores in the
input alone, too.
Amends the start of the public history.
Pick-to: 6.10 6.9 6.8 6.5
Change-Id: I5870a35812cb3fc0b28888cb09e9f42661684a26
Reviewed-by: Mårten Nordheim <marten.nordheim@qt.io>
2025-08-27 16:10:37 +00:00
|
|
|
{ QChar::Script_TaiLe, "Tai_Le" },
|
|
|
|
{ QChar::Script_LinearB, "Linear_B" },
|
2012-12-08 03:36:49 +00:00
|
|
|
{ QChar::Script_Ugaritic, "Ugaritic" },
|
|
|
|
{ QChar::Script_Shavian, "Shavian" },
|
|
|
|
{ QChar::Script_Osmanya, "Osmanya" },
|
|
|
|
{ QChar::Script_Cypriot, "Cypriot" },
|
|
|
|
{ QChar::Script_Braille, "Braille" },
|
|
|
|
// 4.1
|
|
|
|
{ QChar::Script_Buginese, "Buginese" },
|
util/unicode: remove replace('_', "") from readScripts()
For some reason, the code stored the official Unicode script tags
without their intervening underscores, removing underscores from the
input before attempting to match, which works, as long as Unicode
stays consistent in spelling properties "Like_This".
Relying on that is brittle, though, seeing as a tag without intervening
underscore (SignWriting) already slipped into the database, potentially
matching a sought Sign_Writing. It's highly unlikely that Unicode will
start to use property names that differ only by their use of underscore,
but why risk it, and why confuse readers of code by using a different
sought string, compared to what's in the files?
Fix by storing the tags unaltered and leaving the underscores in the
input alone, too.
Amends the start of the public history.
Pick-to: 6.10 6.9 6.8 6.5
Change-Id: I5870a35812cb3fc0b28888cb09e9f42661684a26
Reviewed-by: Mårten Nordheim <marten.nordheim@qt.io>
2025-08-27 16:10:37 +00:00
|
|
|
{ QChar::Script_NewTaiLue, "New_Tai_Lue" },
|
2012-12-08 03:36:49 +00:00
|
|
|
{ QChar::Script_Glagolitic, "Glagolitic" },
|
|
|
|
{ QChar::Script_Tifinagh, "Tifinagh" },
|
util/unicode: remove replace('_', "") from readScripts()
For some reason, the code stored the official Unicode script tags
without their intervening underscores, removing underscores from the
input before attempting to match, which works, as long as Unicode
stays consistent in spelling properties "Like_This".
Relying on that is brittle, though, seeing as a tag without intervening
underscore (SignWriting) already slipped into the database, potentially
matching a sought Sign_Writing. It's highly unlikely that Unicode will
start to use property names that differ only by their use of underscore,
but why risk it, and why confuse readers of code by using a different
sought string, compared to what's in the files?
Fix by storing the tags unaltered and leaving the underscores in the
input alone, too.
Amends the start of the public history.
Pick-to: 6.10 6.9 6.8 6.5
Change-Id: I5870a35812cb3fc0b28888cb09e9f42661684a26
Reviewed-by: Mårten Nordheim <marten.nordheim@qt.io>
2025-08-27 16:10:37 +00:00
|
|
|
{ QChar::Script_SylotiNagri, "Syloti_Nagri" },
|
|
|
|
{ QChar::Script_OldPersian, "Old_Persian" },
|
2012-12-08 03:36:49 +00:00
|
|
|
{ QChar::Script_Kharoshthi, "Kharoshthi" },
|
|
|
|
// 5.0
|
|
|
|
{ QChar::Script_Balinese, "Balinese" },
|
|
|
|
{ QChar::Script_Cuneiform, "Cuneiform" },
|
|
|
|
{ QChar::Script_Phoenician, "Phoenician" },
|
util/unicode: remove replace('_', "") from readScripts()
For some reason, the code stored the official Unicode script tags
without their intervening underscores, removing underscores from the
input before attempting to match, which works, as long as Unicode
stays consistent in spelling properties "Like_This".
Relying on that is brittle, though, seeing as a tag without intervening
underscore (SignWriting) already slipped into the database, potentially
matching a sought Sign_Writing. It's highly unlikely that Unicode will
start to use property names that differ only by their use of underscore,
but why risk it, and why confuse readers of code by using a different
sought string, compared to what's in the files?
Fix by storing the tags unaltered and leaving the underscores in the
input alone, too.
Amends the start of the public history.
Pick-to: 6.10 6.9 6.8 6.5
Change-Id: I5870a35812cb3fc0b28888cb09e9f42661684a26
Reviewed-by: Mårten Nordheim <marten.nordheim@qt.io>
2025-08-27 16:10:37 +00:00
|
|
|
{ QChar::Script_PhagsPa, "Phags_Pa" },
|
2012-12-08 03:36:49 +00:00
|
|
|
{ QChar::Script_Nko, "Nko" },
|
|
|
|
// 5.1
|
|
|
|
{ QChar::Script_Sundanese, "Sundanese" },
|
|
|
|
{ QChar::Script_Lepcha, "Lepcha" },
|
util/unicode: remove replace('_', "") from readScripts()
For some reason, the code stored the official Unicode script tags
without their intervening underscores, removing underscores from the
input before attempting to match, which works, as long as Unicode
stays consistent in spelling properties "Like_This".
Relying on that is brittle, though, seeing as a tag without intervening
underscore (SignWriting) already slipped into the database, potentially
matching a sought Sign_Writing. It's highly unlikely that Unicode will
start to use property names that differ only by their use of underscore,
but why risk it, and why confuse readers of code by using a different
sought string, compared to what's in the files?
Fix by storing the tags unaltered and leaving the underscores in the
input alone, too.
Amends the start of the public history.
Pick-to: 6.10 6.9 6.8 6.5
Change-Id: I5870a35812cb3fc0b28888cb09e9f42661684a26
Reviewed-by: Mårten Nordheim <marten.nordheim@qt.io>
2025-08-27 16:10:37 +00:00
|
|
|
{ QChar::Script_OlChiki, "Ol_Chiki" },
|
2012-12-08 03:36:49 +00:00
|
|
|
{ QChar::Script_Vai, "Vai" },
|
|
|
|
{ QChar::Script_Saurashtra, "Saurashtra" },
|
util/unicode: remove replace('_', "") from readScripts()
For some reason, the code stored the official Unicode script tags
without their intervening underscores, removing underscores from the
input before attempting to match, which works, as long as Unicode
stays consistent in spelling properties "Like_This".
Relying on that is brittle, though, seeing as a tag without intervening
underscore (SignWriting) already slipped into the database, potentially
matching a sought Sign_Writing. It's highly unlikely that Unicode will
start to use property names that differ only by their use of underscore,
but why risk it, and why confuse readers of code by using a different
sought string, compared to what's in the files?
Fix by storing the tags unaltered and leaving the underscores in the
input alone, too.
Amends the start of the public history.
Pick-to: 6.10 6.9 6.8 6.5
Change-Id: I5870a35812cb3fc0b28888cb09e9f42661684a26
Reviewed-by: Mårten Nordheim <marten.nordheim@qt.io>
2025-08-27 16:10:37 +00:00
|
|
|
{ QChar::Script_KayahLi, "Kayah_Li" },
|
2012-12-08 03:36:49 +00:00
|
|
|
{ QChar::Script_Rejang, "Rejang" },
|
|
|
|
{ QChar::Script_Lycian, "Lycian" },
|
|
|
|
{ QChar::Script_Carian, "Carian" },
|
|
|
|
{ QChar::Script_Lydian, "Lydian" },
|
|
|
|
{ QChar::Script_Cham, "Cham" },
|
|
|
|
// 5.2
|
util/unicode: remove replace('_', "") from readScripts()
For some reason, the code stored the official Unicode script tags
without their intervening underscores, removing underscores from the
input before attempting to match, which works, as long as Unicode
stays consistent in spelling properties "Like_This".
Relying on that is brittle, though, seeing as a tag without intervening
underscore (SignWriting) already slipped into the database, potentially
matching a sought Sign_Writing. It's highly unlikely that Unicode will
start to use property names that differ only by their use of underscore,
but why risk it, and why confuse readers of code by using a different
sought string, compared to what's in the files?
Fix by storing the tags unaltered and leaving the underscores in the
input alone, too.
Amends the start of the public history.
Pick-to: 6.10 6.9 6.8 6.5
Change-Id: I5870a35812cb3fc0b28888cb09e9f42661684a26
Reviewed-by: Mårten Nordheim <marten.nordheim@qt.io>
2025-08-27 16:10:37 +00:00
|
|
|
{ QChar::Script_TaiTham, "Tai_Tham" },
|
|
|
|
{ QChar::Script_TaiViet, "Tai_Viet" },
|
2012-12-08 03:36:49 +00:00
|
|
|
{ QChar::Script_Avestan, "Avestan" },
|
util/unicode: remove replace('_', "") from readScripts()
For some reason, the code stored the official Unicode script tags
without their intervening underscores, removing underscores from the
input before attempting to match, which works, as long as Unicode
stays consistent in spelling properties "Like_This".
Relying on that is brittle, though, seeing as a tag without intervening
underscore (SignWriting) already slipped into the database, potentially
matching a sought Sign_Writing. It's highly unlikely that Unicode will
start to use property names that differ only by their use of underscore,
but why risk it, and why confuse readers of code by using a different
sought string, compared to what's in the files?
Fix by storing the tags unaltered and leaving the underscores in the
input alone, too.
Amends the start of the public history.
Pick-to: 6.10 6.9 6.8 6.5
Change-Id: I5870a35812cb3fc0b28888cb09e9f42661684a26
Reviewed-by: Mårten Nordheim <marten.nordheim@qt.io>
2025-08-27 16:10:37 +00:00
|
|
|
{ QChar::Script_EgyptianHieroglyphs, "Egyptian_Hieroglyphs" },
|
2012-12-08 03:36:49 +00:00
|
|
|
{ QChar::Script_Samaritan, "Samaritan" },
|
|
|
|
{ QChar::Script_Lisu, "Lisu" },
|
|
|
|
{ QChar::Script_Bamum, "Bamum" },
|
|
|
|
{ QChar::Script_Javanese, "Javanese" },
|
util/unicode: remove replace('_', "") from readScripts()
For some reason, the code stored the official Unicode script tags
without their intervening underscores, removing underscores from the
input before attempting to match, which works, as long as Unicode
stays consistent in spelling properties "Like_This".
Relying on that is brittle, though, seeing as a tag without intervening
underscore (SignWriting) already slipped into the database, potentially
matching a sought Sign_Writing. It's highly unlikely that Unicode will
start to use property names that differ only by their use of underscore,
but why risk it, and why confuse readers of code by using a different
sought string, compared to what's in the files?
Fix by storing the tags unaltered and leaving the underscores in the
input alone, too.
Amends the start of the public history.
Pick-to: 6.10 6.9 6.8 6.5
Change-Id: I5870a35812cb3fc0b28888cb09e9f42661684a26
Reviewed-by: Mårten Nordheim <marten.nordheim@qt.io>
2025-08-27 16:10:37 +00:00
|
|
|
{ QChar::Script_MeeteiMayek, "Meetei_Mayek" },
|
|
|
|
{ QChar::Script_ImperialAramaic, "Imperial_Aramaic" },
|
|
|
|
{ QChar::Script_OldSouthArabian, "Old_South_Arabian" },
|
|
|
|
{ QChar::Script_InscriptionalParthian, "Inscriptional_Parthian" },
|
|
|
|
{ QChar::Script_InscriptionalPahlavi, "Inscriptional_Pahlavi" },
|
|
|
|
{ QChar::Script_OldTurkic, "Old_Turkic" },
|
2012-12-08 03:36:49 +00:00
|
|
|
{ QChar::Script_Kaithi, "Kaithi" },
|
|
|
|
// 6.0
|
|
|
|
{ QChar::Script_Batak, "Batak" },
|
|
|
|
{ QChar::Script_Brahmi, "Brahmi" },
|
|
|
|
{ QChar::Script_Mandaic, "Mandaic" },
|
|
|
|
// 6.1
|
|
|
|
{ QChar::Script_Chakma, "Chakma" },
|
util/unicode: remove replace('_', "") from readScripts()
For some reason, the code stored the official Unicode script tags
without their intervening underscores, removing underscores from the
input before attempting to match, which works, as long as Unicode
stays consistent in spelling properties "Like_This".
Relying on that is brittle, though, seeing as a tag without intervening
underscore (SignWriting) already slipped into the database, potentially
matching a sought Sign_Writing. It's highly unlikely that Unicode will
start to use property names that differ only by their use of underscore,
but why risk it, and why confuse readers of code by using a different
sought string, compared to what's in the files?
Fix by storing the tags unaltered and leaving the underscores in the
input alone, too.
Amends the start of the public history.
Pick-to: 6.10 6.9 6.8 6.5
Change-Id: I5870a35812cb3fc0b28888cb09e9f42661684a26
Reviewed-by: Mårten Nordheim <marten.nordheim@qt.io>
2025-08-27 16:10:37 +00:00
|
|
|
{ QChar::Script_MeroiticCursive, "Meroitic_Cursive" },
|
|
|
|
{ QChar::Script_MeroiticHieroglyphs, "Meroitic_Hieroglyphs" },
|
2012-12-08 03:36:49 +00:00
|
|
|
{ QChar::Script_Miao, "Miao" },
|
|
|
|
{ QChar::Script_Sharada, "Sharada" },
|
util/unicode: remove replace('_', "") from readScripts()
For some reason, the code stored the official Unicode script tags
without their intervening underscores, removing underscores from the
input before attempting to match, which works, as long as Unicode
stays consistent in spelling properties "Like_This".
Relying on that is brittle, though, seeing as a tag without intervening
underscore (SignWriting) already slipped into the database, potentially
matching a sought Sign_Writing. It's highly unlikely that Unicode will
start to use property names that differ only by their use of underscore,
but why risk it, and why confuse readers of code by using a different
sought string, compared to what's in the files?
Fix by storing the tags unaltered and leaving the underscores in the
input alone, too.
Amends the start of the public history.
Pick-to: 6.10 6.9 6.8 6.5
Change-Id: I5870a35812cb3fc0b28888cb09e9f42661684a26
Reviewed-by: Mårten Nordheim <marten.nordheim@qt.io>
2025-08-27 16:10:37 +00:00
|
|
|
{ QChar::Script_SoraSompeng, "Sora_Sompeng" },
|
2012-12-08 03:36:49 +00:00
|
|
|
{ QChar::Script_Takri, "Takri" },
|
Update Unicode data up to v7.0
* Two newly adopted currency symbols:
the Azerbaijani manat and the Russian ruble
* Pictographic symbols (including many emoji), geometric symbols,
arrows, and ornaments originating from the Wingdings and Webdings sets
* Twenty-three new lesser-used and historic scripts
extending support for written languages of North America, China, India,
other Asian countries, and Africa
* Letters used in Teuthonista and other transcriptional systems,
and a new notational set, Duployan
For more details, see http://www.unicode.org/versions/Unicode7.0.0/
The Properties struct's .*Diff members were narrowed down
to signed 15 bits and the unicodeVersion has been expanded to 8 bits.
[ChangeLog][QtCore] Unicode data updated to v.7.0
Change-Id: I93ab6f79fa3b05f61abc7279f1d046834c1c1a0b
Reviewed-by: Lars Knoll <lars.knoll@digia.com>
2015-03-23 21:01:06 +00:00
|
|
|
// 7.0
|
util/unicode: remove replace('_', "") from readScripts()
For some reason, the code stored the official Unicode script tags
without their intervening underscores, removing underscores from the
input before attempting to match, which works, as long as Unicode
stays consistent in spelling properties "Like_This".
Relying on that is brittle, though, seeing as a tag without intervening
underscore (SignWriting) already slipped into the database, potentially
matching a sought Sign_Writing. It's highly unlikely that Unicode will
start to use property names that differ only by their use of underscore,
but why risk it, and why confuse readers of code by using a different
sought string, compared to what's in the files?
Fix by storing the tags unaltered and leaving the underscores in the
input alone, too.
Amends the start of the public history.
Pick-to: 6.10 6.9 6.8 6.5
Change-Id: I5870a35812cb3fc0b28888cb09e9f42661684a26
Reviewed-by: Mårten Nordheim <marten.nordheim@qt.io>
2025-08-27 16:10:37 +00:00
|
|
|
{ QChar::Script_CaucasianAlbanian, "Caucasian_Albanian" },
|
|
|
|
{ QChar::Script_BassaVah, "Bassa_Vah" },
|
Update Unicode data up to v7.0
* Two newly adopted currency symbols:
the Azerbaijan manat and the Russia ruble
* Pictographic symbols (including many emoji), geometric symbols,
arrows, and ornaments originating from the Wingdings and Webdings sets
* Twenty-three new lesser-used and historic scripts
extending support for written languages of North America, China, India,
other Asian countries, and Africa
* Letters used in Teuthonista and other transcriptional systems,
and a new notational set, Duployan
For more details, see http://www.unicode.org/versions/Unicode7.0.0/
The Properties struct's .*Diff members were narrowed down
to signed 15 bits and the unicodeVersion has been expanded to 8 bits.
[ChangeLog][QtCore] Unicode data updated to v.7.0
Change-Id: I93ab6f79fa3b05f61abc7279f1d046834c1c1a0b
Reviewed-by: Lars Knoll <lars.knoll@digia.com>
2015-03-23 21:01:06 +00:00
|
|
|
{ QChar::Script_Duployan, "Duployan" },
|
|
|
|
{ QChar::Script_Elbasan, "Elbasan" },
|
|
|
|
{ QChar::Script_Grantha, "Grantha" },
|
util/unicode: remove replace('_', "") from readScripts()
For some reason, the code stored the official Unicode script tags
without their intervening underscores, removing underscores from the
input before attempting to match, which works, as long as Unicode
stays consistent in spelling properties "Like_This".
Relying on that is brittle, though, seeing as a tag without intervening
underscore (SignWriting) already slipped into the database, potentially
matching a sought Sign_Writing. It's highly unlikely that Unicode will
start to use property names that differ only by their use of underscore,
but why risk it, and why confuse readers of code by using a different
sought string, compared to what's in the files?
Fix by storing the tags unaltered and leaving the underscores in the
input alone, too.
Amends the start of the public history.
Pick-to: 6.10 6.9 6.8 6.5
Change-Id: I5870a35812cb3fc0b28888cb09e9f42661684a26
Reviewed-by: Mårten Nordheim <marten.nordheim@qt.io>
2025-08-27 16:10:37 +00:00
|
|
|
{ QChar::Script_PahawhHmong, "Pahawh_Hmong" },
|
Update Unicode data up to v7.0
* Two newly adopted currency symbols:
the Azerbaijan manat and the Russia ruble
* Pictographic symbols (including many emoji), geometric symbols,
arrows, and ornaments originating from the Wingdings and Webdings sets
* Twenty-three new lesser-used and historic scripts
extending support for written languages of North America, China, India,
other Asian countries, and Africa
* Letters used in Teuthonista and other transcriptional systems,
and a new notational set, Duployan
For more details, see http://www.unicode.org/versions/Unicode7.0.0/
The Properties struct's .*Diff members were narrowed down
to signed 15 bits and the unicodeVersion has been expanded to 8 bits.
[ChangeLog][QtCore] Unicode data updated to v.7.0
Change-Id: I93ab6f79fa3b05f61abc7279f1d046834c1c1a0b
Reviewed-by: Lars Knoll <lars.knoll@digia.com>
2015-03-23 21:01:06 +00:00
|
|
|
{ QChar::Script_Khojki, "Khojki" },
|
util/unicode: remove replace('_', "") from readScripts()
For some reason, the code stored the official Unicode script tags
without their intervening underscores, removing underscores from the
input before attempting to match, which works, as long as Unicode
stays consistent in spelling properties "Like_This".
Relying on that is brittle, though, seeing as a tag without intervening
underscore (SignWriting) already slipped into the database, potentially
matching a sought Sign_Writing. It's highly unlikely that Unicode will
start to use property names that differ only by their use of underscore,
but why risk it, and why confuse readers of code by using a different
sought string, compared to what's in the files?
Fix by storing the tags unaltered and leaving the underscores in the
input alone, too.
Amends the start of the public history.
Pick-to: 6.10 6.9 6.8 6.5
Change-Id: I5870a35812cb3fc0b28888cb09e9f42661684a26
Reviewed-by: Mårten Nordheim <marten.nordheim@qt.io>
2025-08-27 16:10:37 +00:00
|
|
|
{ QChar::Script_LinearA, "Linear_A" },
|
Update Unicode data up to v7.0
* Two newly adopted currency symbols:
the Azerbaijan manat and the Russia ruble
* Pictographic symbols (including many emoji), geometric symbols,
arrows, and ornaments originating from the Wingdings and Webdings sets
* Twenty-three new lesser-used and historic scripts
extending support for written languages of North America, China, India,
other Asian countries, and Africa
* Letters used in Teuthonista and other transcriptional systems,
and a new notational set, Duployan
For more details, see http://www.unicode.org/versions/Unicode7.0.0/
The Properties struct's .*Diff members were narrowed down
to signed 15 bits and the unicodeVersion has been expanded to 8 bits.
[ChangeLog][QtCore] Unicode data updated to v.7.0
Change-Id: I93ab6f79fa3b05f61abc7279f1d046834c1c1a0b
Reviewed-by: Lars Knoll <lars.knoll@digia.com>
2015-03-23 21:01:06 +00:00
|
|
|
{ QChar::Script_Mahajani, "Mahajani" },
|
|
|
|
{ QChar::Script_Manichaean, "Manichaean" },
|
util/unicode: remove replace('_', "") from readScripts()
For some reason, the code stored the official Unicode script tags
without their intervening underscores, removing underscores from the
input before attempting to match, which works, as long as Unicode
stays consistent in spelling properties "Like_This".
Relying on that is brittle, though, seeing as a tag without intervening
underscore (SignWriting) already slipped into the database, potentially
matching a sought Sign_Writing. It's highly unlikely that Unicode will
start to use property names that differ only by their use of underscore,
but why risk it, and why confuse readers of code by using a different
sought string, compared to what's in the files?
Fix by storing the tags unaltered and leaving the underscores in the
input alone, too.
Amends the start of the public history.
Pick-to: 6.10 6.9 6.8 6.5
Change-Id: I5870a35812cb3fc0b28888cb09e9f42661684a26
Reviewed-by: Mårten Nordheim <marten.nordheim@qt.io>
2025-08-27 16:10:37 +00:00
|
|
|
{ QChar::Script_MendeKikakui, "Mende_Kikakui" },
|
Update Unicode data up to v7.0
* Two newly adopted currency symbols:
the Azerbaijan manat and the Russia ruble
* Pictographic symbols (including many emoji), geometric symbols,
arrows, and ornaments originating from the Wingdings and Webdings sets
* Twenty-three new lesser-used and historic scripts
extending support for written languages of North America, China, India,
other Asian countries, and Africa
* Letters used in Teuthonista and other transcriptional systems,
and a new notational set, Duployan
For more details, see http://www.unicode.org/versions/Unicode7.0.0/
The Properties struct's .*Diff members were narrowed down
to signed 15 bits and the unicodeVersion has been expanded to 8 bits.
[ChangeLog][QtCore] Unicode data updated to v.7.0
Change-Id: I93ab6f79fa3b05f61abc7279f1d046834c1c1a0b
Reviewed-by: Lars Knoll <lars.knoll@digia.com>
2015-03-23 21:01:06 +00:00
|
|
|
{ QChar::Script_Modi, "Modi" },
|
|
|
|
{ QChar::Script_Mro, "Mro" },
|
util/unicode: remove replace('_', "") from readScripts()
For some reason, the code stored the official Unicode script tags
without their intervening underscores, removing underscores from the
input before attempting to match, which works, as long as Unicode
stays consistent in spelling properties "Like_This".
Relying on that is brittle, though, seeing as a tag without intervening
underscore (SignWriting) already slipped into the database, potentially
matching a sought Sign_Writing. It's highly unlikely that Unicode will
start to use property names that differ only by their use of underscore,
but why risk it, and why confuse readers of code by using a different
sought string, compared to what's in the files?
Fix by storing the tags unaltered and leaving the underscores in the
input alone, too.
Amends the start of the public history.
Pick-to: 6.10 6.9 6.8 6.5
Change-Id: I5870a35812cb3fc0b28888cb09e9f42661684a26
Reviewed-by: Mårten Nordheim <marten.nordheim@qt.io>
2025-08-27 16:10:37 +00:00
|
|
|
{ QChar::Script_OldNorthArabian, "Old_North_Arabian" },
|
Update Unicode data up to v7.0
* Two newly adopted currency symbols:
the Azerbaijan manat and the Russia ruble
* Pictographic symbols (including many emoji), geometric symbols,
arrows, and ornaments originating from the Wingdings and Webdings sets
* Twenty-three new lesser-used and historic scripts
extending support for written languages of North America, China, India,
other Asian countries, and Africa
* Letters used in Teuthonista and other transcriptional systems,
and a new notational set, Duployan
For more details, see http://www.unicode.org/versions/Unicode7.0.0/
The Properties struct's .*Diff members were narrowed down
to signed 15 bits and the unicodeVersion has been expanded to 8 bits.
[ChangeLog][QtCore] Unicode data updated to v.7.0
Change-Id: I93ab6f79fa3b05f61abc7279f1d046834c1c1a0b
Reviewed-by: Lars Knoll <lars.knoll@digia.com>
2015-03-23 21:01:06 +00:00
|
|
|
{ QChar::Script_Nabataean, "Nabataean" },
|
|
|
|
{ QChar::Script_Palmyrene, "Palmyrene" },
|
util/unicode: remove replace('_', "") from readScripts()
For some reason, the code stored the official Unicode script tags
without their intervening underscores, removing underscores from the
input before attempting to match, which works, as long as Unicode
stays consistent in spelling properties "Like_This".
Relying on that is brittle, though, seeing as a tag without intervening
underscore (SignWriting) already slipped into the database, potentially
matching a sought Sign_Writing. It's highly unlikely that Unicode will
start to use property names that differ only by their use of underscore,
but why risk it, and why confuse readers of code by using a different
sought string, compared to what's in the files?
Fix by storing the tags unaltered and leaving the underscores in the
input alone, too.
Amends the start of the public history.
Pick-to: 6.10 6.9 6.8 6.5
Change-Id: I5870a35812cb3fc0b28888cb09e9f42661684a26
Reviewed-by: Mårten Nordheim <marten.nordheim@qt.io>
2025-08-27 16:10:37 +00:00
|
|
|
{ QChar::Script_PauCinHau, "Pau_Cin_Hau" },
|
|
|
|
{ QChar::Script_OldPermic, "Old_Permic" },
|
|
|
|
{ QChar::Script_PsalterPahlavi, "Psalter_Pahlavi" },
|
Update Unicode data up to v7.0
* Two newly adopted currency symbols:
the Azerbaijan manat and the Russia ruble
* Pictographic symbols (including many emoji), geometric symbols,
arrows, and ornaments originating from the Wingdings and Webdings sets
* Twenty-three new lesser-used and historic scripts
extending support for written languages of North America, China, India,
other Asian countries, and Africa
* Letters used in Teuthonista and other transcriptional systems,
and a new notational set, Duployan
For more details, see http://www.unicode.org/versions/Unicode7.0.0/
The Properties struct's .*Diff members were narrowed down
to signed 15 bits and the unicodeVersion has been expanded to 8 bits.
[ChangeLog][QtCore] Unicode data updated to v.7.0
Change-Id: I93ab6f79fa3b05f61abc7279f1d046834c1c1a0b
Reviewed-by: Lars Knoll <lars.knoll@digia.com>
2015-03-23 21:01:06 +00:00
|
|
|
{ QChar::Script_Siddham, "Siddham" },
|
|
|
|
{ QChar::Script_Khudawadi, "Khudawadi" },
|
|
|
|
{ QChar::Script_Tirhuta, "Tirhuta" },
|
util/unicode: remove replace('_', "") from readScripts()
For some reason, the code stored the official Unicode script tags
without their intervening underscores, removing underscores from the
input before attempting to match, which works, as long as Unicode
stays consistent in spelling properties "Like_This".
Relying on that is brittle, though, seeing as a tag without intervening
underscore (SignWriting) already slipped into the database, potentially
matching a sought Sign_Writing. It's highly unlikely that Unicode will
start to use property names that differ only by their use of underscore,
but why risk it, and why confuse readers of code by using a different
sought string, compared to what's in the files?
Fix by storing the tags unaltered and leaving the underscores in the
input alone, too.
Amends the start of the public history.
Pick-to: 6.10 6.9 6.8 6.5
Change-Id: I5870a35812cb3fc0b28888cb09e9f42661684a26
Reviewed-by: Mårten Nordheim <marten.nordheim@qt.io>
2025-08-27 16:10:37 +00:00
|
|
|
{ QChar::Script_WarangCiti, "Warang_Citi" },
|
2015-11-02 04:28:14 +00:00
|
|
|
// 8.0
|
|
|
|
{ QChar::Script_Ahom, "Ahom" },
|
util/unicode: remove replace('_', "") from readScripts()
For some reason, the code stored the official Unicode script tags
without their intervening underscores, removing underscores from the
input before attempting to match, which works, as long as Unicode
stays consistent in spelling properties "Like_This".
Relying on that is brittle, though, seeing as a tag without intervening
underscore (SignWriting) already slipped into the database, potentially
matching a sought Sign_Writing. It's highly unlikely that Unicode will
start to use property names that differ only by their use of underscore,
but why risk it, and why confuse readers of code by using a different
sought string, compared to what's in the files?
Fix by storing the tags unaltered and leaving the underscores in the
input alone, too.
Amends the start of the public history.
Pick-to: 6.10 6.9 6.8 6.5
Change-Id: I5870a35812cb3fc0b28888cb09e9f42661684a26
Reviewed-by: Mårten Nordheim <marten.nordheim@qt.io>
2025-08-27 16:10:37 +00:00
|
|
|
{ QChar::Script_AnatolianHieroglyphs, "Anatolian_Hieroglyphs" },
|
2015-11-02 04:28:14 +00:00
|
|
|
{ QChar::Script_Hatran, "Hatran" },
|
|
|
|
{ QChar::Script_Multani, "Multani" },
|
util/unicode: remove replace('_', "") from readScripts()
For some reason, the code stored the official Unicode script tags
without their intervening underscores, removing underscores from the
input before attempting to match, which works, as long as Unicode
stays consistent in spelling properties "Like_This".
Relying on that is brittle, though, seeing as a tag without intervening
underscore (SignWriting) already slipped into the database, potentially
matching a sought Sign_Writing. It's highly unlikely that Unicode will
start to use property names that differ only by their use of underscore,
but why risk it, and why confuse readers of code by using a different
sought string, compared to what's in the files?
Fix by storing the tags unaltered and leaving the underscores in the
input alone, too.
Amends the start of the public history.
Pick-to: 6.10 6.9 6.8 6.5
Change-Id: I5870a35812cb3fc0b28888cb09e9f42661684a26
Reviewed-by: Mårten Nordheim <marten.nordheim@qt.io>
2025-08-27 16:10:37 +00:00
|
|
|
{ QChar::Script_OldHungarian, "Old_Hungarian" },
|
2015-11-02 04:28:14 +00:00
|
|
|
{ QChar::Script_SignWriting, "SignWriting" },
|
2017-12-12 08:47:53 +00:00
|
|
|
// 9.0
|
|
|
|
{ QChar::Script_Adlam, "Adlam" },
|
|
|
|
{ QChar::Script_Bhaiksuki, "Bhaiksuki" },
|
|
|
|
{ QChar::Script_Marchen, "Marchen" },
|
|
|
|
{ QChar::Script_Newa, "Newa" },
|
|
|
|
{ QChar::Script_Osage, "Osage" },
|
|
|
|
{ QChar::Script_Tangut, "Tangut" },
|
|
|
|
// 10.0
|
util/unicode: remove replace('_', "") from readScripts()
For some reason, the code stored the official Unicode script tags
without their intervening underscores, removing underscores from the
input before attempting to match, which works, as long as Unicode
stays consistent in spelling properties "Like_This".
Relying on that is brittle, though, seeing as a tag without intervening
underscore (SignWriting) already slipped into the database, potentially
matching a sought Sign_Writing. It's highly unlikely that Unicode will
start to use property names that differ only by their use of underscore,
but why risk it, and why confuse readers of code by using a different
sought string, compared to what's in the files?
Fix by storing the tags unaltered and leaving the underscores in the
input alone, too.
Amends the start of the public history.
Pick-to: 6.10 6.9 6.8 6.5
Change-Id: I5870a35812cb3fc0b28888cb09e9f42661684a26
Reviewed-by: Mårten Nordheim <marten.nordheim@qt.io>
2025-08-27 16:10:37 +00:00
|
|
|
{ QChar::Script_MasaramGondi, "Masaram_Gondi" },
|
2017-12-12 08:47:53 +00:00
|
|
|
{ QChar::Script_Nushu, "Nushu" },
|
|
|
|
{ QChar::Script_Soyombo, "Soyombo" },
|
util/unicode: remove replace('_', "") from readScripts()
For some reason, the code stored the official Unicode script tags
without their intervening underscores, removing underscores from the
input before attempting to match, which works, as long as Unicode
stays consistent in spelling properties "Like_This".
Relying on that is brittle, though, seeing as a tag without intervening
underscore (SignWriting) already slipped into the database, potentially
matching a sought Sign_Writing. It's highly unlikely that Unicode will
start to use property names that differ only by their use of underscore,
but why risk it, and why confuse readers of code by using a different
sought string, compared to what's in the files?
Fix by storing the tags unaltered and leaving the underscores in the
input alone, too.
Amends the start of the public history.
Pick-to: 6.10 6.9 6.8 6.5
Change-Id: I5870a35812cb3fc0b28888cb09e9f42661684a26
Reviewed-by: Mårten Nordheim <marten.nordheim@qt.io>
2025-08-27 16:10:37 +00:00
|
|
|
{ QChar::Script_ZanabazarSquare, "Zanabazar_Square" },
|
2019-10-23 15:17:49 +00:00
|
|
|
// 12.1
|
|
|
|
{ QChar::Script_Dogra, "Dogra" },
|
util/unicode: remove replace('_', "") from readScripts()
For some reason, the code stored the official Unicode script tags
without their intervening underscores, removing underscores from the
input before attempting to match, which works, as long as Unicode
stays consistent in spelling properties "Like_This".
Relying on that is brittle, though, seeing as a tag without intervening
underscore (SignWriting) already slipped into the database, potentially
matching a sought Sign_Writing. It's highly unlikely that Unicode will
start to use property names that differ only by their use of underscore,
but why risk it, and why confuse readers of code by using a different
sought string, compared to what's in the files?
Fix by storing the tags unaltered and leaving the underscores in the
input alone, too.
Amends the start of the public history.
Pick-to: 6.10 6.9 6.8 6.5
Change-Id: I5870a35812cb3fc0b28888cb09e9f42661684a26
Reviewed-by: Mårten Nordheim <marten.nordheim@qt.io>
2025-08-27 16:10:37 +00:00
|
|
|
{ QChar::Script_GunjalaGondi, "Gunjala_Gondi" },
|
|
|
|
{ QChar::Script_HanifiRohingya, "Hanifi_Rohingya" },
|
2019-10-23 15:17:49 +00:00
|
|
|
{ QChar::Script_Makasar, "Makasar" },
|
|
|
|
{ QChar::Script_Medefaidrin, "Medefaidrin" },
|
util/unicode: remove replace('_', "") from readScripts()
For some reason, the code stored the official Unicode script tags
without their intervening underscores, removing underscores from the
input before attempting to match, which works, as long as Unicode
stays consistent in spelling properties "Like_This".
Relying on that is brittle, though, seeing as a tag without intervening
underscore (SignWriting) already slipped into the database, potentially
matching a sought Sign_Writing. It's highly unlikely that Unicode will
start to use property names that differ only by their use of underscore,
but why risk it, and why confuse readers of code by using a different
sought string, compared to what's in the files?
Fix by storing the tags unaltered and leaving the underscores in the
input alone, too.
Amends the start of the public history.
Pick-to: 6.10 6.9 6.8 6.5
Change-Id: I5870a35812cb3fc0b28888cb09e9f42661684a26
Reviewed-by: Mårten Nordheim <marten.nordheim@qt.io>
2025-08-27 16:10:37 +00:00
|
|
|
{ QChar::Script_OldSogdian, "Old_Sogdian" },
|
2019-10-23 15:17:49 +00:00
|
|
|
{ QChar::Script_Sogdian, "Sogdian" },
|
|
|
|
{ QChar::Script_Elymaic, "Elymaic" },
|
|
|
|
{ QChar::Script_Nandinagari, "Nandinagari" },
|
util/unicode: remove replace('_', "") from readScripts()
For some reason, the code stored the official Unicode script tags
without their intervening underscores, removing underscores from the
input before attempting to match, which works, as long as Unicode
stays consistent in spelling properties "Like_This".
Relying on that is brittle, though, seeing as a tag without intervening
underscore (SignWriting) already slipped into the database, potentially
matching a sought Sign_Writing. It's highly unlikely that Unicode will
start to use property names that differ only by their use of underscore,
but why risk it, and why confuse readers of code by using a different
sought string, compared to what's in the files?
Fix by storing the tags unaltered and leaving the underscores in the
input alone, too.
Amends the start of the public history.
Pick-to: 6.10 6.9 6.8 6.5
Change-Id: I5870a35812cb3fc0b28888cb09e9f42661684a26
Reviewed-by: Mårten Nordheim <marten.nordheim@qt.io>
2025-08-27 16:10:37 +00:00
|
|
|
{ QChar::Script_NyiakengPuachueHmong, "Nyiakeng_Puachue_Hmong" },
|
2019-10-23 15:17:49 +00:00
|
|
|
{ QChar::Script_Wancho, "Wancho" },
|
2020-03-13 16:26:53 +00:00
|
|
|
// 13.0
|
|
|
|
{ QChar::Script_Chorasmian, "Chorasmian" },
|
util/unicode: remove replace('_', "") from readScripts()
For some reason, the code stored the official Unicode script tags
without their intervening underscores, removing underscores from the
input before attempting to match, which works, as long as Unicode
stays consistent in spelling properties "Like_This".
Relying on that is brittle, though, seeing as a tag without intervening
underscore (SignWriting) already slipped into the database, potentially
matching a sought Sign_Writing. It's highly unlikely that Unicode will
start to use property names that differ only by their use of underscore,
but why risk it, and why confuse readers of code by using a different
sought string, compared to what's in the files?
Fix by storing the tags unaltered and leaving the underscores in the
input alone, too.
Amends the start of the public history.
Pick-to: 6.10 6.9 6.8 6.5
Change-Id: I5870a35812cb3fc0b28888cb09e9f42661684a26
Reviewed-by: Mårten Nordheim <marten.nordheim@qt.io>
2025-08-27 16:10:37 +00:00
|
|
|
{ QChar::Script_DivesAkuru, "Dives_Akuru" },
|
|
|
|
{ QChar::Script_KhitanSmallScript, "Khitan_Small_Script" },
|
2020-03-13 16:26:53 +00:00
|
|
|
{ QChar::Script_Yezidi, "Yezidi" },
|
2017-12-12 08:47:53 +00:00
|
|
|
|
2021-10-04 13:06:52 +00:00
|
|
|
// 14.0
|
util/unicode: remove replace('_', "") from readScripts()
For some reason, the code stored the official Unicode script tags
without their intervening underscores, removing underscores from the
input before attempting to match, which works, as long as Unicode
stays consistent in spelling properties "Like_This".
Relying on that is brittle, though, seeing as a tag without intervening
underscore (SignWriting) already slipped into the database, potentially
matching a sought Sign_Writing. It's highly unlikely that Unicode will
start to use property names that differ only by their use of underscore,
but why risk it, and why confuse readers of code by using a different
sought string, compared to what's in the files?
Fix by storing the tags unaltered and leaving the underscores in the
input alone, too.
Amends the start of the public history.
Pick-to: 6.10 6.9 6.8 6.5
Change-Id: I5870a35812cb3fc0b28888cb09e9f42661684a26
Reviewed-by: Mårten Nordheim <marten.nordheim@qt.io>
2025-08-27 16:10:37 +00:00
|
|
|
{ QChar::Script_CyproMinoan, "Cypro_Minoan"},
|
|
|
|
{ QChar::Script_OldUyghur, "Old_Uyghur"},
|
2021-10-04 13:06:52 +00:00
|
|
|
{ QChar::Script_Tangsa, "Tangsa"},
|
|
|
|
{ QChar::Script_Toto, "Toto"},
|
|
|
|
{ QChar::Script_Vithkuqi, "Vithkuqi"},
|
|
|
|
|
2022-10-04 11:57:44 +00:00
|
|
|
// 15.0
|
|
|
|
{ QChar::Script_Kawi, "Kawi"},
|
util/unicode: remove replace('_', "") from readScripts()
For some reason, the code stored the official Unicode script tags
without their intervening underscores, removing underscores from the
input before attempting to match, which works, as long as Unicode
stays consistent in spelling properties "Like_This".
Relying on that is brittle, though, seeing as a tag without intervening
underscore (SignWriting) already slipped into the database, potentially
matching a sought Sign_Writing. It's highly unlikely that Unicode will
start to use property names that differ only by their use of underscore,
but why risk it, and why confuse readers of code by using a different
sought string, compared to what's in the files?
Fix by storing the tags unaltered and leaving the underscores in the
input alone, too.
Amends the start of the public history.
Pick-to: 6.10 6.9 6.8 6.5
Change-Id: I5870a35812cb3fc0b28888cb09e9f42661684a26
Reviewed-by: Mårten Nordheim <marten.nordheim@qt.io>
2025-08-27 16:10:37 +00:00
|
|
|
{ QChar::Script_NagMundari, "Nag_Mundari"},
|
2022-10-04 11:57:44 +00:00
|
|
|
|
2025-01-17 12:03:50 +00:00
|
|
|
// 16.0
|
|
|
|
{ QChar::Script_Garay, "Garay"},
|
util/unicode: remove replace('_', "") from readScripts()
For some reason, the code stored the official Unicode script tags
without their intervening underscores, removing underscores from the
input before attempting to match, which works, as long as Unicode
stays consistent in spelling properties "Like_This".
Relying on that is brittle, though, seeing as a tag without intervening
underscore (SignWriting) already slipped into the database, potentially
matching a sought Sign_Writing. It's highly unlikely that Unicode will
start to use property names that differ only by their use of underscore,
but why risk it, and why confuse readers of code by using a different
sought string, compared to what's in the files?
Fix by storing the tags unaltered and leaving the underscores in the
input alone, too.
Amends the start of the public history.
Pick-to: 6.10 6.9 6.8 6.5
Change-Id: I5870a35812cb3fc0b28888cb09e9f42661684a26
Reviewed-by: Mårten Nordheim <marten.nordheim@qt.io>
2025-08-27 16:10:37 +00:00
|
|
|
{ QChar::Script_GurungKhema, "Gurung_Khema"},
|
|
|
|
{ QChar::Script_KiratRai, "Kirat_Rai"},
|
|
|
|
{ QChar::Script_OlOnal, "Ol_Onal"},
|
2025-01-17 12:03:50 +00:00
|
|
|
{ QChar::Script_Sunuwar, "Sunuwar"},
|
|
|
|
{ QChar::Script_Todhri, "Todhri"},
|
util/unicode: remove replace('_', "") from readScripts()
For some reason, the code stored the official Unicode script tags
without their intervening underscores, removing underscores from the
input before attempting to match, which works, as long as Unicode
stays consistent in spelling properties "Like_This".
Relying on that is brittle, though, seeing as a tag without intervening
underscore (SignWriting) already slipped into the database, potentially
matching a sought Sign_Writing. It's highly unlikely that Unicode will
start to use property names that differ only by their use of underscore,
but why risk it, and why confuse readers of code by using a different
sought string, compared to what's in the files?
Fix by storing the tags unaltered and leaving the underscores in the
input alone, too.
Amends the start of the public history.
Pick-to: 6.10 6.9 6.8 6.5
Change-Id: I5870a35812cb3fc0b28888cb09e9f42661684a26
Reviewed-by: Mårten Nordheim <marten.nordheim@qt.io>
2025-08-27 16:10:37 +00:00
|
|
|
{ QChar::Script_TuluTigalari, "Tulu_Tigalari"},
|
2025-01-17 12:03:50 +00:00
|
|
|
|
2012-12-08 03:36:49 +00:00
|
|
|
// unhandled
|
|
|
|
{ QChar::Script_Unknown, 0 }
|
|
|
|
};
|
|
|
|
Scrpt *p = scripts;
|
|
|
|
while (p->name) {
|
|
|
|
scriptMap.insert(p->name, p->script);
|
|
|
|
++p;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-07-30 10:09:46 +00:00
|
|
|
// IDNA status as present in the data file
|
|
|
|
// Status values keyed by the names registered in initIdnaStatusMap().
// The numeric values are spelled out because IdnaStatus is required to use
// the same values for the members the two enums share.
enum class IdnaRawStatus : unsigned int {
    Disallowed = 0,
    Valid = 1,
    Ignored = 2,
    Mapped = 3,
    Deviation = 4,
    // The two STD3 variants have no counterpart in IdnaStatus.
    DisallowedStd3Valid = 5,
    DisallowedStd3Mapped = 6,
};
|
|
|
|
|
|
|
|
// Maps a status name (e.g. "valid", "disallowed_STD3_mapped") to its
// IdnaRawStatus value; filled in by initIdnaStatusMap().
static QHash<QByteArray, IdnaRawStatus> idnaStatusMap;
|
|
|
|
|
|
|
|
static void initIdnaStatusMap()
|
|
|
|
{
|
|
|
|
struct {
|
|
|
|
IdnaRawStatus status;
|
|
|
|
const char *name;
|
|
|
|
} data[] = {
|
|
|
|
{IdnaRawStatus::Disallowed, "disallowed"},
|
|
|
|
{IdnaRawStatus::Valid, "valid"},
|
|
|
|
{IdnaRawStatus::Ignored, "ignored"},
|
|
|
|
{IdnaRawStatus::Mapped, "mapped"},
|
|
|
|
{IdnaRawStatus::Deviation, "deviation"},
|
|
|
|
{IdnaRawStatus::DisallowedStd3Valid, "disallowed_STD3_valid"},
|
|
|
|
{IdnaRawStatus::DisallowedStd3Mapped, "disallowed_STD3_mapped"},
|
|
|
|
};
|
|
|
|
|
|
|
|
for (const auto &entry : data)
|
|
|
|
idnaStatusMap[entry.name] = entry.status;
|
|
|
|
}
|
|
|
|
|
|
|
|
// C++ source text declaring the IdnaStatus enum (presumably written into the
// generated tables header -- confirm at the point of use). Its enumerators
// mirror the IdnaStatus enum that this tool itself uses.
static const char *idna_status_string =
    "enum class IdnaStatus : unsigned int {\n"
    " Disallowed,\n"
    " Valid,\n"
    " Ignored,\n"
    " Mapped,\n"
    " Deviation\n"
    "};\n\n";
|
|
|
|
|
|
|
|
// IDNA status after resolution, i.e. the form that is stored in the database.
// Qt relaxes the host name validity rules so that underscores are accepted.
// NOTE: keep the order and the numeric values of these members identical to
// the corresponding members of IdnaRawStatus.
enum class IdnaStatus : unsigned int {
    Disallowed = 0,
    Valid = 1,
    Ignored = 2,
    Mapped = 3,
    Deviation = 4,
};
|
|
|
|
|
2024-03-19 07:59:18 +00:00
|
|
|
// C++ source text declaring the EmojiFlags enum (presumably written into the
// generated tables header -- confirm at the point of use). Unlike the
// EmojiFlags enum used inside this tool, it has no Extended_Pictographic member.
static const char *emoji_flags_string =
    "enum class EmojiFlags : uchar {\n"
    " NoEmoji = 0,\n"
    " Emoji = 1,\n"
    " Emoji_Presentation = 2,\n"
    " Emoji_Modifier = 4,\n"
    " Emoji_Modifier_Base = 8,\n"
    " Emoji_Component = 16\n"
    "};\n\n";
|
|
|
|
|
|
|
|
enum class EmojiFlags : uchar
|
|
|
|
{
|
|
|
|
NoEmoji = 0,
|
|
|
|
Emoji = 1,
|
|
|
|
Emoji_Presentation = 2,
|
|
|
|
Emoji_Modifier = 4,
|
|
|
|
Emoji_Modifier_Base = 8,
|
|
|
|
Emoji_Component = 16,
|
|
|
|
|
|
|
|
// Stored via grapheme break, so this is not added to emojiFlags property
|
|
|
|
Extended_Pictographic = 32,
|
|
|
|
};
|
|
|
|
|
|
|
|
// Maps a property name (e.g. "Emoji", "Extended_Pictographic") to its
// EmojiFlags bit; filled in by initEmojiFlagsMap().
static QHash<QByteArray, EmojiFlags> emojiFlagsMap;
|
|
|
|
|
|
|
|
static void initEmojiFlagsMap()
|
|
|
|
{
|
|
|
|
struct {
|
|
|
|
EmojiFlags flags;
|
|
|
|
const char *name;
|
|
|
|
} data[] = {
|
|
|
|
{EmojiFlags::Emoji, "Emoji"},
|
|
|
|
{EmojiFlags::Emoji_Presentation, "Emoji_Presentation"},
|
|
|
|
{EmojiFlags::Emoji_Modifier, "Emoji_Modifier"},
|
|
|
|
{EmojiFlags::Emoji_Modifier_Base, "Emoji_Modifier_Base"},
|
|
|
|
{EmojiFlags::Emoji_Component, "Emoji_Component"},
|
|
|
|
{EmojiFlags::Extended_Pictographic, "Extended_Pictographic"},
|
|
|
|
};
|
|
|
|
|
|
|
|
for (const auto &entry : data)
|
|
|
|
emojiFlagsMap[entry.name] = entry.flags;
|
|
|
|
}
|
|
|
|
|
2011-04-27 10:05:43 +00:00
|
|
|
// Keep this one in sync with the code in createPropertyInfo
//
// C++ source text declaring the Case enum, the bit-packed Properties struct
// and the two properties() lookup functions. The bit-field widths below have
// to add up to the size asserted through SizeOfPropertiesStruct.
static const char *property_string =
    "enum Case {\n"
    " LowerCase,\n"
    " UpperCase,\n"
    " TitleCase,\n"
    " CaseFold,\n\n"
    " NumCases\n"
    "};\n\n"
    "struct Properties {\n"
    " ushort category : 5;\n"
    " ushort direction : 5;\n"
    " ushort emojiFlags : 6; /* 5 used */\n"
    " ushort combiningClass : 8;\n"
    " ushort joining : 3;\n"
    " signed short digitValue : 5;\n"
    " signed short mirrorDiff : 16;\n"
    " ushort unicodeVersion : 5; /* 5 used */\n"
    " ushort eastAsianWidth : 3; /* 3 used */\n"
    " ushort nfQuickCheck : 8;\n" // could be narrowed
    "#ifdef Q_OS_WASM\n"
    " unsigned char : 0; //wasm 64 packing trick\n"
    "#endif\n"
    " struct {\n"
    " ushort special : 1;\n"
    " signed short diff : 15;\n"
    " } cases[NumCases];\n"
    "#ifdef Q_OS_WASM\n"
    " unsigned char : 0; //wasm 64 packing trick\n"
    "#endif\n"
    " ushort graphemeBreakClass : 5; /* 5 used */\n"
    " ushort wordBreakClass : 5; /* 5 used */\n"
    " ushort lineBreakClass : 6; /* 6 used */\n"
    " ushort sentenceBreakClass : 4; /* 4 used */\n"
    " ushort idnaStatus : 4; /* 3 used */\n"
    " ushort script : 8;\n"
    "};\n\n"
    "Q_DECL_CONST_FUNCTION\n"
    "Q_CORE_EXPORT const Properties * QT_FASTCALL properties(char32_t ucs4) noexcept;\n"
    "Q_DECL_CONST_FUNCTION\n"
    "Q_CORE_EXPORT const Properties * QT_FASTCALL properties(char16_t ucs2) noexcept;\n\n";
|
2011-04-27 10:05:43 +00:00
|
|
|
|
|
|
|
// C++ source text declaring the per-property accessors. Each property gets
// an exported char32_t overload plus an inline QChar overload that forwards
// to it through ch.unicode().
static const char *methods =
    "Q_CORE_EXPORT GraphemeBreakClass QT_FASTCALL graphemeBreakClass(char32_t ucs4) noexcept;\n"
    "inline GraphemeBreakClass graphemeBreakClass(QChar ch) noexcept\n"
    "{ return graphemeBreakClass(ch.unicode()); }\n\n"
    "Q_CORE_EXPORT WordBreakClass QT_FASTCALL wordBreakClass(char32_t ucs4) noexcept;\n"
    "inline WordBreakClass wordBreakClass(QChar ch) noexcept\n"
    "{ return wordBreakClass(ch.unicode()); }\n\n"
    "Q_CORE_EXPORT SentenceBreakClass QT_FASTCALL sentenceBreakClass(char32_t ucs4) noexcept;\n"
    "inline SentenceBreakClass sentenceBreakClass(QChar ch) noexcept\n"
    "{ return sentenceBreakClass(ch.unicode()); }\n\n"
    "Q_CORE_EXPORT LineBreakClass QT_FASTCALL lineBreakClass(char32_t ucs4) noexcept;\n"
    "inline LineBreakClass lineBreakClass(QChar ch) noexcept\n"
    "{ return lineBreakClass(ch.unicode()); }\n\n"
    "Q_CORE_EXPORT IdnaStatus QT_FASTCALL idnaStatus(char32_t ucs4) noexcept;\n"
    "inline IdnaStatus idnaStatus(QChar ch) noexcept\n"
    "{ return idnaStatus(ch.unicode()); }\n\n"
    "Q_CORE_EXPORT QStringView QT_FASTCALL idnaMapping(char32_t usc4) noexcept;\n"
    "inline QStringView idnaMapping(QChar ch) noexcept\n"
    "{ return idnaMapping(ch.unicode()); }\n\n"
    "Q_CORE_EXPORT EastAsianWidth QT_FASTCALL eastAsianWidth(char32_t ucs4) noexcept;\n"
    "inline EastAsianWidth eastAsianWidth(QChar ch) noexcept\n"
    "{ return eastAsianWidth(ch.unicode()); }\n\n";
|
2011-04-27 10:05:43 +00:00
|
|
|
|
|
|
|
// Expected sizeof(Properties); sizeOfPropertiesStructCheck embeds this number
// in the text of a static_assert.
static const int SizeOfPropertiesStruct = 20;
|
|
|
|
|
2019-09-03 21:40:56 +00:00
|
|
|
// Text of a static_assert that pins sizeof(Properties) to
// SizeOfPropertiesStruct, followed by an empty line.
static const QByteArray sizeOfPropertiesStructCheck =
        QByteArray("static_assert(sizeof(Properties) == ")
        + QByteArray::number(SizeOfPropertiesStruct)
        + ");\n\n";
|
2019-09-03 21:40:56 +00:00
|
|
|
|
2011-04-27 10:05:43 +00:00
|
|
|
struct PropertyFlags {
|
2020-08-05 11:38:02 +00:00
|
|
|
PropertyFlags()
|
|
|
|
: combiningClass(0)
|
|
|
|
, category(QChar::Other_NotAssigned) // Cn
|
|
|
|
, direction(QChar::DirL)
|
2024-03-19 07:59:18 +00:00
|
|
|
, emojiFlags(0)
|
2020-08-05 11:38:02 +00:00
|
|
|
, joining(QChar::Joining_None)
|
|
|
|
, age(QChar::Unicode_Unassigned)
|
|
|
|
, mirrorDiff(0) {}
|
|
|
|
|
2012-09-27 22:57:39 +00:00
|
|
|
bool operator==(const PropertyFlags &o) const {
|
2011-04-27 10:05:43 +00:00
|
|
|
return (combiningClass == o.combiningClass
|
|
|
|
&& category == o.category
|
|
|
|
&& direction == o.direction
|
2024-03-19 07:59:18 +00:00
|
|
|
&& emojiFlags == o.emojiFlags
|
2011-04-27 10:05:43 +00:00
|
|
|
&& joining == o.joining
|
|
|
|
&& age == o.age
|
2022-05-06 11:44:58 +00:00
|
|
|
&& eastAsianWidth == o.eastAsianWidth
|
2011-04-27 10:05:43 +00:00
|
|
|
&& digitValue == o.digitValue
|
|
|
|
&& mirrorDiff == o.mirrorDiff
|
|
|
|
&& lowerCaseDiff == o.lowerCaseDiff
|
|
|
|
&& upperCaseDiff == o.upperCaseDiff
|
|
|
|
&& titleCaseDiff == o.titleCaseDiff
|
|
|
|
&& caseFoldDiff == o.caseFoldDiff
|
|
|
|
&& lowerCaseSpecial == o.lowerCaseSpecial
|
|
|
|
&& upperCaseSpecial == o.upperCaseSpecial
|
|
|
|
&& titleCaseSpecial == o.titleCaseSpecial
|
|
|
|
&& caseFoldSpecial == o.caseFoldSpecial
|
2012-06-17 01:55:07 +00:00
|
|
|
&& graphemeBreakClass == o.graphemeBreakClass
|
|
|
|
&& wordBreakClass == o.wordBreakClass
|
|
|
|
&& sentenceBreakClass == o.sentenceBreakClass
|
|
|
|
&& lineBreakClass == o.lineBreakClass
|
2012-06-11 12:19:24 +00:00
|
|
|
&& script == o.script
|
2013-08-10 12:41:52 +00:00
|
|
|
&& nfQuickCheck == o.nfQuickCheck
|
2021-07-30 10:09:46 +00:00
|
|
|
&& idnaStatus == o.idnaStatus
|
2011-04-27 10:05:43 +00:00
|
|
|
);
|
|
|
|
}
|
|
|
|
// from UnicodeData.txt
|
|
|
|
uchar combiningClass : 8;
|
2025-01-20 14:38:05 +00:00
|
|
|
uchar category : 5; // QChar::Category, but unsigned
|
|
|
|
uchar direction : 5; // QChar::Direction, but unsigned
|
2024-03-19 07:59:18 +00:00
|
|
|
// from emoji-data.txt
|
|
|
|
uchar emojiFlags : 5;
|
2011-04-27 10:05:43 +00:00
|
|
|
// from ArabicShaping.txt
|
2025-01-20 14:38:05 +00:00
|
|
|
uchar joining : 3; // QChar::JoiningType, but unsigned
|
2011-04-27 10:05:43 +00:00
|
|
|
// from DerivedAge.txt
|
2025-01-20 14:38:05 +00:00
|
|
|
uchar age : 5; // QChar::UnicodeVersion, but unsigned
|
2022-05-06 11:44:58 +00:00
|
|
|
// From EastAsianWidth.txt
|
|
|
|
EastAsianWidth eastAsianWidth = EastAsianWidth::N;
|
2020-08-05 11:38:02 +00:00
|
|
|
int digitValue = -1;
|
2011-04-27 10:05:43 +00:00
|
|
|
|
|
|
|
int mirrorDiff : 16;
|
|
|
|
|
2020-08-05 11:38:02 +00:00
|
|
|
int lowerCaseDiff = 0;
|
|
|
|
int upperCaseDiff = 0;
|
|
|
|
int titleCaseDiff = 0;
|
|
|
|
int caseFoldDiff = 0;
|
|
|
|
bool lowerCaseSpecial = 0;
|
|
|
|
bool upperCaseSpecial = 0;
|
|
|
|
bool titleCaseSpecial = 0;
|
|
|
|
bool caseFoldSpecial = 0;
|
|
|
|
GraphemeBreakClass graphemeBreakClass = GraphemeBreak_Any;
|
|
|
|
WordBreakClass wordBreakClass = WordBreak_Any;
|
|
|
|
SentenceBreakClass sentenceBreakClass = SentenceBreak_Any;
|
|
|
|
LineBreakClass lineBreakClass = LineBreak_AL;
|
|
|
|
int script = QChar::Script_Unknown;
|
2013-08-10 12:41:52 +00:00
|
|
|
// from DerivedNormalizationProps.txt
|
2020-08-05 11:38:02 +00:00
|
|
|
uchar nfQuickCheck = 0;
|
2021-07-30 10:09:46 +00:00
|
|
|
IdnaStatus idnaStatus = IdnaStatus::Disallowed;
|
2011-04-27 10:05:43 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
// Flat pool of special-case mappings, maintained by appendToSpecialCaseMap():
// index 0 is a placeholder, followed by entries that each consist of a
// length value and that many UTF-16 code units.
static QList<int> specialCaseMap;
|
|
|
|
|
2025-08-27 09:39:47 +00:00
|
|
|
// Stores the mapping `map` (a sequence of code points) in specialCaseMap as
// a length-prefixed run of UTF-16 code units and returns the index of its
// length slot. An identical entry that is already stored is reused instead
// of being appended a second time.
static int appendToSpecialCaseMap(QSpan<const int> map)
{
    // Slot 0 is reserved for the length, which is only known after encoding.
    QList<int> entry;
    entry << 0;
    for (char32_t ucs4 : map) {
        // if the condition below doesn't hold anymore we need to modify our special case mapping code
        Q_ASSERT(!QChar::requiresSurrogates(ucs4));
        if (!QChar::requiresSurrogates(ucs4)) {
            entry << ucs4;
            continue;
        }
        entry << QChar::highSurrogate(ucs4);
        entry << QChar::lowSurrogate(ucs4);
    }
    const int length = entry.size() - 1;
    entry[0] = length;

    if (specialCaseMap.isEmpty())
        specialCaseMap << 0; // placeholder

    // Hop from one stored entry to the next, looking for an exact duplicate.
    int start = 1;
    while (start < specialCaseMap.size()) {
        const int storedLength = specialCaseMap.at(start);
        if (storedLength == length) {
            int matched = 0;
            while (matched < length
                   && specialCaseMap.at(start + 1 + matched) == entry.at(1 + matched)) {
                ++matched;
            }
            if (matched == length)
                return start;
        }
        start += storedLength + 1;
    }

    // No duplicate found: the new entry goes to the end of the pool.
    const int pos = specialCaseMap.size();
    specialCaseMap << entry;
    return pos;
}
|
|
|
|
|
Update Unicode data up to v7.0
* Two newly adopted currency symbols:
the Azerbaijan manat and the Russia ruble
* Pictographic symbols (including many emoji), geometric symbols,
arrows, and ornaments originating from the Wingdings and Webdings sets
* Twenty-three new lesser-used and historic scripts
extending support for written languages of North America, China, India,
other Asian countries, and Africa
* Letters used in Teuthonista and other transcriptional systems,
and a new notational set, Duployan
For more details, see http://www.unicode.org/versions/Unicode7.0.0/
The Properties struct's .*Diff members were narrowed down
to signed 15 bits and the unicodeVersion has been expanded to 8 bits.
[ChangeLog][QtCore] Unicode data updated to v.7.0
Change-Id: I93ab6f79fa3b05f61abc7279f1d046834c1c1a0b
Reviewed-by: Lars Knoll <lars.knoll@digia.com>
2015-03-23 21:01:06 +00:00
|
|
|
// DerivedCoreProperties.txt
//
// Returns true if ucs4 is one of the hard-coded Default_Ignorable_Code_Point
// values or ranges listed below, false otherwise.
static inline bool isDefaultIgnorable(uint ucs4)
{
    // Default_Ignorable_Code_Point:
    //  Generated from
    //    Other_Default_Ignorable_Code_Point + Cf + Variation_Selector
    //    - White_Space - FFF9..FFFB (Annotation Characters)
    //    - 0600..0604, 06DD, 070F, 110BD (exceptional Cf characters that should be visible)

    // Within Latin-1 only SOFT HYPHEN qualifies.
    if (ucs4 <= 0xff)
        return ucs4 == 0xad;

    return ucs4 == 0x034f
            || ucs4 == 0x061c
            || (ucs4 >= 0x115f && ucs4 <= 0x1160)
            || (ucs4 >= 0x17b4 && ucs4 <= 0x17b5)
            // 180B..180D, 180E and 180F: U+180F (MONGOLIAN FREE VARIATION
            // SELECTOR FOUR, added in Unicode 14.0) is a Variation_Selector
            // as well, so the range has to extend up to it.
            || (ucs4 >= 0x180b && ucs4 <= 0x180f)
            || (ucs4 >= 0x200b && ucs4 <= 0x200f)
            || (ucs4 >= 0x202a && ucs4 <= 0x202e)
            || (ucs4 >= 0x2060 && ucs4 <= 0x206f)
            || ucs4 == 0x3164
            || (ucs4 >= 0xfe00 && ucs4 <= 0xfe0f)
            || ucs4 == 0xfeff
            || ucs4 == 0xffa0
            || (ucs4 >= 0xfff0 && ucs4 <= 0xfff8)
            || (ucs4 >= 0x1bca0 && ucs4 <= 0x1bca3)
            || (ucs4 >= 0x1d173 && ucs4 <= 0x1d17a)
            || (ucs4 >= 0xe0000 && ucs4 <= 0xe0fff);
}
|
|
|
|
|
2011-04-27 10:05:43 +00:00
|
|
|
struct UnicodeData {
|
|
|
|
UnicodeData(int codepoint = 0) {
|
|
|
|
p.direction = QChar::DirL;
|
|
|
|
// DerivedBidiClass.txt
|
2012-06-03 01:17:10 +00:00
|
|
|
// The unassigned code points that default to AL are in the ranges:
|
|
|
|
// [U+0600..U+07BF, U+08A0..U+08FF, U+FB50..U+FDCF, U+FDF0..U+FDFF, U+FE70..U+FEFF, U+1EE00..U+1EEFF]
|
|
|
|
if ((codepoint >= 0x0600 && codepoint <= 0x07BF)
|
|
|
|
|| (codepoint >= 0x08A0 && codepoint <= 0x08FF)
|
|
|
|
|| (codepoint >= 0xFB50 && codepoint <= 0xFDCF)
|
|
|
|
|| (codepoint >= 0xFDF0 && codepoint <= 0xFDFF)
|
|
|
|
|| (codepoint >= 0xFE70 && codepoint <= 0xFEFF)
|
|
|
|
|| (codepoint >= 0x1EE00 && codepoint <= 0x1EEFF)) {
|
|
|
|
p.direction = QChar::DirAL;
|
|
|
|
}
|
|
|
|
// The unassigned code points that default to R are in the ranges:
|
|
|
|
// [U+0590..U+05FF, U+07C0..U+089F, U+FB1D..U+FB4F, U+10800..U+10FFF, U+1E800..U+1EDFF, U+1EF00..U+1EFFF]
|
|
|
|
else if ((codepoint >= 0x0590 && codepoint <= 0x05FF)
|
|
|
|
|| (codepoint >= 0x07C0 && codepoint <= 0x089F)
|
|
|
|
|| (codepoint >= 0xFB1D && codepoint <= 0xFB4F)
|
|
|
|
|| (codepoint >= 0x10800 && codepoint <= 0x10FFF)
|
|
|
|
|| (codepoint >= 0x1E800 && codepoint <= 0x1EDFF)
|
|
|
|
|| (codepoint >= 0x1EF00 && codepoint <= 0x1EFFF)) {
|
2011-04-27 10:05:43 +00:00
|
|
|
p.direction = QChar::DirR;
|
|
|
|
}
|
2014-01-12 19:14:25 +00:00
|
|
|
// The unassigned code points that default to ET are in the range:
|
|
|
|
// [U+20A0..U+20CF]
|
|
|
|
else if (codepoint >= 0x20A0 && codepoint <= 0x20CF) {
|
|
|
|
p.direction = QChar::DirET;
|
|
|
|
}
|
|
|
|
// The unassigned code points that default to BN have one of the following properties:
|
|
|
|
// Default_Ignorable_Code_Point
|
|
|
|
// Noncharacter_Code_Point
|
|
|
|
else if (QChar::isNonCharacter(codepoint) || isDefaultIgnorable(codepoint)) {
|
|
|
|
p.direction = QChar::DirBN;
|
|
|
|
}
|
2012-06-03 01:17:10 +00:00
|
|
|
|
2012-06-17 01:55:07 +00:00
|
|
|
p.lineBreakClass = LineBreak_AL; // XX -> AL
|
2012-06-03 01:17:10 +00:00
|
|
|
// LineBreak.txt
|
|
|
|
// The unassigned code points that default to "ID" include ranges in the following blocks:
|
2015-11-02 04:28:14 +00:00
|
|
|
// [U+3400..U+4DBF, U+4E00..U+9FFF, U+F900..U+FAFF, U+20000..U+2A6DF, U+2A700..U+2B73F, U+2B740..U+2B81F, U+2B820..U+2CEAF, U+2F800..U+2FA1F]
|
|
|
|
// and any other reserved code points on
|
|
|
|
// [U+20000..U+2FFFD, U+30000..U+3FFFD]
|
2025-01-17 12:03:50 +00:00
|
|
|
// and some unassigned ranges in Plane 1:
|
|
|
|
// [1F000..1F7FF, 1F900..1FAFF, 1FC00..1FFFD]
|
2012-06-03 01:17:10 +00:00
|
|
|
if ((codepoint >= 0x3400 && codepoint <= 0x4DBF)
|
|
|
|
|| (codepoint >= 0x4E00 && codepoint <= 0x9FFF)
|
|
|
|
|| (codepoint >= 0xF900 && codepoint <= 0xFAFF)
|
2025-01-17 12:03:50 +00:00
|
|
|
|| (codepoint >= 0x1F000 && codepoint <= 0x1F7FF)
|
|
|
|
|| (codepoint >= 0x1F900 && codepoint <= 0x1FAFF)
|
|
|
|
|| (codepoint >= 0x1FC00 && codepoint <= 0x1FFFD)
|
2012-06-03 01:17:10 +00:00
|
|
|
|| (codepoint >= 0x20000 && codepoint <= 0x2A6DF)
|
|
|
|
|| (codepoint >= 0x2A700 && codepoint <= 0x2B73F)
|
|
|
|
|| (codepoint >= 0x2B740 && codepoint <= 0x2B81F)
|
2015-11-02 04:28:14 +00:00
|
|
|
|| (codepoint >= 0x2B820 && codepoint <= 0x2CEAF)
|
2012-06-03 01:17:10 +00:00
|
|
|
|| (codepoint >= 0x2F800 && codepoint <= 0x2FA1F)
|
|
|
|
|| (codepoint >= 0x20000 && codepoint <= 0x2FFFD)
|
|
|
|
|| (codepoint >= 0x30000 && codepoint <= 0x3FFFD)) {
|
2012-06-17 01:55:07 +00:00
|
|
|
p.lineBreakClass = LineBreak_ID;
|
2011-04-27 10:05:43 +00:00
|
|
|
}
|
2014-01-12 19:14:25 +00:00
|
|
|
// The unassigned code points that default to "PR" comprise a range in the following block:
|
|
|
|
// [U+20A0..U+20CF]
|
|
|
|
else if (codepoint >= 0x20A0 && codepoint <= 0x20CF) {
|
|
|
|
p.lineBreakClass = LineBreak_PR;
|
|
|
|
}
|
2011-04-27 10:05:43 +00:00
|
|
|
}
|
2012-05-08 01:43:16 +00:00
|
|
|
|
|
|
|
static UnicodeData &valueRef(int codepoint);
|
|
|
|
|
2011-04-27 10:05:43 +00:00
|
|
|
PropertyFlags p;
|
|
|
|
|
|
|
|
// from UnicodeData.txt
|
2020-08-05 11:38:02 +00:00
|
|
|
QChar::Decomposition decompositionType = QChar::NoDecomposition;
|
2011-04-27 10:05:43 +00:00
|
|
|
QList<int> decomposition;
|
|
|
|
|
|
|
|
QList<int> specialFolding;
|
|
|
|
|
|
|
|
// from BidiMirroring.txt
|
2020-08-05 11:38:02 +00:00
|
|
|
int mirroredChar = 0;
|
2011-04-27 10:05:43 +00:00
|
|
|
|
|
|
|
// DerivedNormalizationProps.txt
|
2020-08-05 11:38:02 +00:00
|
|
|
bool excludedComposition = false;
|
2011-04-27 10:05:43 +00:00
|
|
|
|
|
|
|
// computed position of unicode property set
|
2020-08-05 11:38:02 +00:00
|
|
|
int propertyIndex = -1;
|
2021-07-30 10:09:46 +00:00
|
|
|
|
|
|
|
IdnaRawStatus idnaRawStatus = IdnaRawStatus::Disallowed;
|
2011-04-27 10:05:43 +00:00
|
|
|
};
|
|
|
|
|
2012-05-08 01:43:16 +00:00
|
|
|
static QList<UnicodeData> unicodeData;
|
|
|
|
|
|
|
|
UnicodeData &UnicodeData::valueRef(int codepoint)
|
|
|
|
{
|
|
|
|
static bool initialized = false;
|
|
|
|
if (!initialized) {
|
2012-05-15 17:48:20 +00:00
|
|
|
unicodeData.reserve(QChar::LastValidCodePoint + 1);
|
|
|
|
for (int uc = 0; uc <= QChar::LastValidCodePoint; ++uc)
|
2012-05-08 01:43:16 +00:00
|
|
|
unicodeData.append(UnicodeData(uc));
|
|
|
|
initialized = true;
|
|
|
|
}
|
|
|
|
|
|
|
|
Q_ASSERT(codepoint <= 0x10ffff);
|
|
|
|
return unicodeData[codepoint];
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2011-04-27 10:05:43 +00:00
|
|
|
// Histogram filled by readUnicodeData(): key is the number of code points in
// a decomposition, value is how many UnicodeData.txt entries have that length.
static QHash<int, int> decompositionLength;
// Largest code point for which readUnicodeData() found a decomposition.
static int highestComposedCharacter = 0;
// Ligature statistics; maintained by code outside this part of the file.
static int numLigatures = 0;
static int highestLigature = 0;
|
|
|
|
|
|
|
|
// Three code points describing one ligature.
// NOTE(review): presumably u1 and u2 are the two components and `ligature`
// is the composed result; confirm against the code that fills these in.
struct Ligature {
    int u1;
    int u2;
    int ligature;
};

// we need them sorted after the first component for fast lookup,
// so only u1 takes part in the ordering
bool operator<(const Ligature &lhs, const Ligature &rhs)
{
    return lhs.u1 < rhs.u1;
}
|
|
|
|
|
2012-04-26 16:29:08 +00:00
|
|
|
// Ligatures grouped under an integer key; filled outside this part of the file.
static QHash<int, QList<Ligature>> ligatureHashes;

// Number of UnicodeData.txt entries seen per canonical combining class.
static QHash<int, int> combiningClassUsage;

// Largest absolute case-mapping offsets that readUnicodeData() stored directly
// (i.e. without going through the special-case map).
static int maxLowerCaseDiff = 0;
static int maxUpperCaseDiff = 0;
static int maxTitleCaseDiff = 0;
// Code point of the most recently read entry whose category is Zs, Zl or Zp.
// NOTE(review): this is the maximum only if UnicodeData.txt is sorted by code
// point - confirm.
static int maxSeparatorCodepoint = 0;
|
2011-04-27 10:05:43 +00:00
|
|
|
|
util/unicode: Extract Method readUnicodeFile()
There's about a dozen files this program reads, and in each of these
cases, the code to read the file line-by-line, remove comments (or
just LF) and trim the line before further handling is duplicated. It's
also very inefficient, we have better APIs these days (readLineInto(),
rvalue *this overloads, truncate() instead of = left(), ...). Besides,
as Mårten pointed out in review, trimmed() already removes the LF, so
we don't need to do it manually.
So Extract Method readUnicodeFile() that does that, coroutine-style
(but with function object for now), from all the readX() functions
(except readUnicodeData() itself, which is using nested readLine()s).
Also maintain a line number for later improving the error messages.
Remove some isEmpty() checks in the lambdas that, after the
refactoring, can never be true (because removing whitespace from a
trimmed() string cannot make the string empty, ditto with
simplified()).
The extracted function could even pre-split the line along `;`, but
for that, I would port each lambda to use QByteArrayView / qTokenizer
first.
Picking to all active branches, because a) this is a tool and b) we
continue to update the Unicode tables in all active branches, so the
tool to do so should not differ, unless the target branch requires it
(changed data structures, e.g.). Note that readLineInto() is not in
6.5, but the tool is not required to be built against the Qt version
it is building tables for, so we can use the latest Qt features here.
Pick-to: 6.10 6.9 6.8 6.5
Change-Id: I3b699f213c98baa45bc8bbdb7ae2ac985d893798
Reviewed-by: Mårten Nordheim <marten.nordheim@qt.io>
Reviewed-by: Ahmad Samir <a.samirh78@gmail.com>
2025-08-26 17:10:45 +00:00
|
|
|
template <typename LineConsumer>
|
|
|
|
void readUnicodeFile(const char *fileName, LineConsumer yield)
|
|
|
|
{
|
|
|
|
qDebug("Reading %s", fileName);
|
|
|
|
|
|
|
|
QFile f("data/"_L1 % QLatin1StringView{fileName});
|
|
|
|
if (!f.open(QFile::ReadOnly))
|
|
|
|
qFatal("Couln't open %s: %ls", fileName, qUtf16Printable(f.errorString()));
|
|
|
|
|
|
|
|
int lineNo = 0;
|
|
|
|
QByteArray line;
|
|
|
|
while (f.readLineInto(&line)) {
|
|
|
|
++lineNo;
|
|
|
|
const auto comment = line.indexOf('#');
|
|
|
|
if (comment >= 0)
|
|
|
|
line.truncate(comment);
|
|
|
|
line = std::move(line).trimmed();
|
|
|
|
if (!line.isEmpty())
|
|
|
|
yield(line, lineNo);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2025-08-27 17:04:10 +00:00
|
|
|
/*
    Parses \a input (surrounding whitespace is ignored) as a hexadecimal code
    point and returns it.

    Terminates the program via qFatal(), mentioning \a lineNo, if \a input is
    not an unsigned hexadecimal number or if the value lies beyond
    QChar::LastValidCodePoint.
*/
static int parseHex(QByteArrayView input, int lineNo)
{
    bool ok = false;
    // Parse as uint to reject negative values, and keep the value unsigned
    // until it has been range-checked: narrowing to int first would turn
    // anything above INT_MAX into a negative number that slips past the check.
    const uint value = input.trimmed().toUInt(&ok, 16);
    if (!ok) {
        qFatal("Failed to parse \"%.*s\" as an unsigned hex number in line %d.",
               int(input.size()), input.data(), lineNo);
    }
    if (value > uint(QChar::LastValidCodePoint)) {
        qFatal("Code point U+%05x is larger than allowed by Unicode in line %d.",
               value, lineNo);
    }
    return int(value);
}
|
|
|
|
|
2025-08-27 10:09:12 +00:00
|
|
|
template <typename Sep = char16_t>
|
|
|
|
QVarLengthArray<int, 4> parseHexList(QByteArrayView input, int lineNo, Sep sep = u' ')
|
2025-08-27 09:36:31 +00:00
|
|
|
{
|
|
|
|
QVarLengthArray<int, 4> result;
|
2025-08-27 10:09:12 +00:00
|
|
|
const auto sb = sep == u' ' ? Qt::SkipEmptyParts : Qt::KeepEmptyParts;
|
2025-08-27 09:36:31 +00:00
|
|
|
for (auto e : qTokenize(QLatin1StringView{input}, sep, sb))
|
|
|
|
result.push_back(parseHex(e, lineNo));
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
2025-08-27 10:09:12 +00:00
|
|
|
static auto parseHexRange(QByteArrayView input, int lineNo)
|
|
|
|
{
|
|
|
|
struct R { int from, to; };
|
|
|
|
|
|
|
|
const auto pair = parseHexList(input, lineNo, ".."_L1);
|
|
|
|
Q_ASSERT(pair.size() <= 2);
|
|
|
|
int from = pair[0];
|
|
|
|
int to = from;
|
|
|
|
if (pair.size() == 2) {
|
|
|
|
to = pair[1];
|
|
|
|
if (from > to)
|
|
|
|
qFatal("invalid range in line %d: %05x > %05x", lineNo, from, to);
|
|
|
|
}
|
|
|
|
return R{from, to};
|
|
|
|
}
|
|
|
|
|
2011-04-27 10:05:43 +00:00
|
|
|
static void readUnicodeData()
|
|
|
|
{
|
2012-06-17 01:20:59 +00:00
|
|
|
qDebug("Reading UnicodeData.txt");
|
|
|
|
|
|
|
|
enum UniDataFields {
|
|
|
|
UD_Value,
|
|
|
|
UD_Name,
|
|
|
|
UD_Category,
|
|
|
|
UD_CombiningClass,
|
|
|
|
UD_BidiCategory,
|
|
|
|
UD_Decomposition,
|
|
|
|
UD_DecimalDigitValue,
|
|
|
|
UD_DigitValue,
|
|
|
|
UD_NumericValue,
|
|
|
|
UD_Mirrored,
|
|
|
|
UD_OldName,
|
|
|
|
UD_Comment,
|
|
|
|
UD_UpperCase,
|
|
|
|
UD_LowerCase,
|
|
|
|
UD_TitleCase
|
|
|
|
};
|
|
|
|
|
2011-04-27 10:05:43 +00:00
|
|
|
QFile f("data/UnicodeData.txt");
|
2025-01-17 12:13:12 +00:00
|
|
|
if (!f.open(QFile::ReadOnly))
|
|
|
|
qFatal() << "Couldn't open UnicodeData.txt:" << f.errorString();
|
2011-04-27 10:05:43 +00:00
|
|
|
|
2025-08-27 17:04:10 +00:00
|
|
|
int lineNo = 0;
|
2011-04-27 10:05:43 +00:00
|
|
|
while (!f.atEnd()) {
|
2025-08-27 17:04:10 +00:00
|
|
|
++lineNo;
|
2011-04-27 10:05:43 +00:00
|
|
|
QByteArray line;
|
|
|
|
line.resize(1024);
|
|
|
|
int len = f.readLine(line.data(), 1024);
|
|
|
|
line.truncate(len-1);
|
|
|
|
|
|
|
|
int comment = line.indexOf('#');
|
|
|
|
if (comment >= 0)
|
|
|
|
line = line.left(comment);
|
|
|
|
if (line.isEmpty())
|
|
|
|
continue;
|
|
|
|
|
|
|
|
QList<QByteArray> properties = line.split(';');
|
2025-08-27 17:04:10 +00:00
|
|
|
const int codepoint = parseHex(properties[UD_Value], lineNo);
|
2012-05-15 17:48:20 +00:00
|
|
|
Q_ASSERT(codepoint <= QChar::LastValidCodePoint);
|
2011-04-27 10:05:43 +00:00
|
|
|
int lastCodepoint = codepoint;
|
|
|
|
|
|
|
|
QByteArray name = properties[UD_Name];
|
|
|
|
if (name.startsWith('<') && name.contains("First")) {
|
|
|
|
QByteArray nextLine;
|
|
|
|
nextLine.resize(1024);
|
|
|
|
f.readLine(nextLine.data(), 1024);
|
2025-08-27 17:04:10 +00:00
|
|
|
++lineNo;
|
2011-04-27 10:05:43 +00:00
|
|
|
QList<QByteArray> properties = nextLine.split(';');
|
|
|
|
Q_ASSERT(properties[UD_Name].startsWith('<') && properties[UD_Name].contains("Last"));
|
2025-08-27 17:04:10 +00:00
|
|
|
lastCodepoint = parseHex(properties[UD_Value], lineNo);
|
2012-05-15 17:48:20 +00:00
|
|
|
Q_ASSERT(lastCodepoint <= QChar::LastValidCodePoint);
|
2011-04-27 10:05:43 +00:00
|
|
|
}
|
|
|
|
|
2012-05-08 01:43:16 +00:00
|
|
|
UnicodeData &data = UnicodeData::valueRef(codepoint);
|
2011-07-08 14:24:57 +00:00
|
|
|
data.p.category = categoryMap.value(properties[UD_Category], QChar::Other_NotAssigned);
|
2025-08-15 17:49:56 +00:00
|
|
|
if (data.p.category == QChar::Separator_Space || data.p.category == QChar::Separator_Line
|
|
|
|
|| data.p.category == QChar::Separator_Paragraph)
|
|
|
|
maxSeparatorCodepoint = codepoint;
|
2011-04-27 10:05:43 +00:00
|
|
|
data.p.combiningClass = properties[UD_CombiningClass].toInt();
|
|
|
|
if (!combiningClassUsage.contains(data.p.combiningClass))
|
|
|
|
combiningClassUsage[data.p.combiningClass] = 1;
|
|
|
|
else
|
|
|
|
++combiningClassUsage[data.p.combiningClass];
|
|
|
|
|
2014-01-12 19:14:25 +00:00
|
|
|
Direction dir = directionMap.value(properties[UD_BidiCategory], Dir_Unassigned);
|
|
|
|
if (dir == Dir_Unassigned)
|
|
|
|
qFatal("unhandled direction value: %s", properties[UD_BidiCategory].constData());
|
|
|
|
data.p.direction = QChar::Direction(dir);
|
2011-04-27 10:05:43 +00:00
|
|
|
|
|
|
|
if (!properties[UD_UpperCase].isEmpty()) {
|
2025-08-27 17:04:10 +00:00
|
|
|
const int upperCase = parseHex(properties[UD_UpperCase], lineNo);
|
2011-04-27 10:05:43 +00:00
|
|
|
int diff = upperCase - codepoint;
|
2015-11-06 20:31:43 +00:00
|
|
|
// if the conditions below doesn't hold anymore we need to modify our upper casing code
|
|
|
|
Q_ASSERT(QChar::requiresSurrogates(codepoint) == QChar::requiresSurrogates(upperCase));
|
|
|
|
if (QChar::requiresSurrogates(codepoint)) {
|
|
|
|
Q_ASSERT(QChar::highSurrogate(codepoint) == QChar::highSurrogate(upperCase));
|
|
|
|
Q_ASSERT(QChar::lowSurrogate(codepoint) + diff == QChar::lowSurrogate(upperCase));
|
|
|
|
}
|
Update Unicode data up to v7.0
* Two newly adopted currency symbols:
the Azerbaijan manat and the Russia ruble
* Pictographic symbols (including many emoji), geometric symbols,
arrows, and ornaments originating from the Wingdings and Webdings sets
* Twenty-three new lesser-used and historic scripts
extending support for written languages of North America, China, India,
other Asian countries, and Africa
* Letters used in Teuthonista and other transcriptional systems,
and a new notational set, Duployan
For more details, see http://www.unicode.org/versions/Unicode7.0.0/
The Properties struct's .*Diff members were narrowed down
to signed 15 bits and the unicodeVersion has been expanded to 8 bits.
[ChangeLog][QtCore] Unicode data updated to v.7.0
Change-Id: I93ab6f79fa3b05f61abc7279f1d046834c1c1a0b
Reviewed-by: Lars Knoll <lars.knoll@digia.com>
2015-03-23 21:01:06 +00:00
|
|
|
if (qAbs(diff) >= (1<<13)) {
|
2012-04-23 15:11:57 +00:00
|
|
|
data.p.upperCaseSpecial = true;
|
2025-08-27 09:39:47 +00:00
|
|
|
data.p.upperCaseDiff = appendToSpecialCaseMap({upperCase});
|
2012-04-23 15:11:57 +00:00
|
|
|
} else {
|
|
|
|
data.p.upperCaseDiff = diff;
|
|
|
|
maxUpperCaseDiff = qMax(maxUpperCaseDiff, qAbs(diff));
|
|
|
|
}
|
2011-04-27 10:05:43 +00:00
|
|
|
}
|
|
|
|
if (!properties[UD_LowerCase].isEmpty()) {
|
2025-08-27 17:04:10 +00:00
|
|
|
const int lowerCase = parseHex(properties[UD_LowerCase], lineNo);
|
2011-04-27 10:05:43 +00:00
|
|
|
int diff = lowerCase - codepoint;
|
2015-11-06 20:31:43 +00:00
|
|
|
// if the conditions below doesn't hold anymore we need to modify our lower casing code
|
|
|
|
Q_ASSERT(QChar::requiresSurrogates(codepoint) == QChar::requiresSurrogates(lowerCase));
|
|
|
|
if (QChar::requiresSurrogates(codepoint)) {
|
|
|
|
Q_ASSERT(QChar::highSurrogate(codepoint) == QChar::highSurrogate(lowerCase));
|
|
|
|
Q_ASSERT(QChar::lowSurrogate(codepoint) + diff == QChar::lowSurrogate(lowerCase));
|
|
|
|
}
|
Update Unicode data up to v7.0
* Two newly adopted currency symbols:
the Azerbaijan manat and the Russia ruble
* Pictographic symbols (including many emoji), geometric symbols,
arrows, and ornaments originating from the Wingdings and Webdings sets
* Twenty-three new lesser-used and historic scripts
extending support for written languages of North America, China, India,
other Asian countries, and Africa
* Letters used in Teuthonista and other transcriptional systems,
and a new notational set, Duployan
For more details, see http://www.unicode.org/versions/Unicode7.0.0/
The Properties struct's .*Diff members were narrowed down
to signed 15 bits and the unicodeVersion has been expanded to 8 bits.
[ChangeLog][QtCore] Unicode data updated to v.7.0
Change-Id: I93ab6f79fa3b05f61abc7279f1d046834c1c1a0b
Reviewed-by: Lars Knoll <lars.knoll@digia.com>
2015-03-23 21:01:06 +00:00
|
|
|
if (qAbs(diff) >= (1<<13)) {
|
2012-04-23 15:11:57 +00:00
|
|
|
data.p.lowerCaseSpecial = true;
|
2025-08-27 09:39:47 +00:00
|
|
|
data.p.lowerCaseDiff = appendToSpecialCaseMap({lowerCase});
|
2012-04-23 15:11:57 +00:00
|
|
|
} else {
|
|
|
|
data.p.lowerCaseDiff = diff;
|
|
|
|
maxLowerCaseDiff = qMax(maxLowerCaseDiff, qAbs(diff));
|
|
|
|
}
|
2011-04-27 10:05:43 +00:00
|
|
|
}
|
|
|
|
// we want toTitleCase to map to ToUpper in case we don't have any titlecase.
|
|
|
|
if (properties[UD_TitleCase].isEmpty())
|
|
|
|
properties[UD_TitleCase] = properties[UD_UpperCase];
|
|
|
|
if (!properties[UD_TitleCase].isEmpty()) {
|
2025-08-27 17:04:10 +00:00
|
|
|
const int titleCase = parseHex(properties[UD_TitleCase], lineNo);
|
2011-04-27 10:05:43 +00:00
|
|
|
int diff = titleCase - codepoint;
|
2015-11-06 20:31:43 +00:00
|
|
|
// if the conditions below doesn't hold anymore we need to modify our title casing code
|
|
|
|
Q_ASSERT(QChar::requiresSurrogates(codepoint) == QChar::requiresSurrogates(titleCase));
|
|
|
|
if (QChar::requiresSurrogates(codepoint)) {
|
|
|
|
Q_ASSERT(QChar::highSurrogate(codepoint) == QChar::highSurrogate(titleCase));
|
|
|
|
Q_ASSERT(QChar::lowSurrogate(codepoint) + diff == QChar::lowSurrogate(titleCase));
|
|
|
|
}
|
Update Unicode data up to v7.0
* Two newly adopted currency symbols:
the Azerbaijan manat and the Russia ruble
* Pictographic symbols (including many emoji), geometric symbols,
arrows, and ornaments originating from the Wingdings and Webdings sets
* Twenty-three new lesser-used and historic scripts
extending support for written languages of North America, China, India,
other Asian countries, and Africa
* Letters used in Teuthonista and other transcriptional systems,
and a new notational set, Duployan
For more details, see http://www.unicode.org/versions/Unicode7.0.0/
The Properties struct's .*Diff members were narrowed down
to signed 15 bits and the unicodeVersion has been expanded to 8 bits.
[ChangeLog][QtCore] Unicode data updated to v.7.0
Change-Id: I93ab6f79fa3b05f61abc7279f1d046834c1c1a0b
Reviewed-by: Lars Knoll <lars.knoll@digia.com>
2015-03-23 21:01:06 +00:00
|
|
|
if (qAbs(diff) >= (1<<13)) {
|
2012-04-23 15:11:57 +00:00
|
|
|
data.p.titleCaseSpecial = true;
|
2025-08-27 09:39:47 +00:00
|
|
|
data.p.titleCaseDiff = appendToSpecialCaseMap({titleCase});
|
2012-04-23 15:11:57 +00:00
|
|
|
} else {
|
|
|
|
data.p.titleCaseDiff = diff;
|
|
|
|
maxTitleCaseDiff = qMax(maxTitleCaseDiff, qAbs(diff));
|
|
|
|
}
|
2011-04-27 10:05:43 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
if (!properties[UD_DigitValue].isEmpty())
|
|
|
|
data.p.digitValue = properties[UD_DigitValue].toInt();
|
|
|
|
|
|
|
|
// decompositition
|
|
|
|
QByteArray decomposition = properties[UD_Decomposition];
|
|
|
|
if (!decomposition.isEmpty()) {
|
|
|
|
highestComposedCharacter = qMax(highestComposedCharacter, codepoint);
|
|
|
|
QList<QByteArray> d = decomposition.split(' ');
|
|
|
|
if (d[0].contains('<')) {
|
|
|
|
data.decompositionType = decompositionMap.value(d[0], QChar::NoDecomposition);
|
|
|
|
if (data.decompositionType == QChar::NoDecomposition)
|
2014-01-26 00:42:37 +00:00
|
|
|
qFatal("unhandled decomposition type: %s", d[0].constData());
|
2011-04-27 10:05:43 +00:00
|
|
|
d.takeFirst();
|
|
|
|
} else {
|
|
|
|
data.decompositionType = QChar::Canonical;
|
|
|
|
}
|
2025-08-27 17:04:10 +00:00
|
|
|
for (qsizetype i = 0; i < d.size(); ++i)
|
|
|
|
data.decomposition.append(parseHex(d[i], lineNo));
|
2012-04-23 03:00:16 +00:00
|
|
|
++decompositionLength[data.decomposition.size()];
|
2011-04-27 10:05:43 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
for (int i = codepoint; i <= lastCodepoint; ++i)
|
2012-05-08 01:43:16 +00:00
|
|
|
unicodeData[i] = data;
|
2011-04-27 10:05:43 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static int maxMirroredDiff = 0;
|
|
|
|
|
|
|
|
static void readBidiMirroring()
|
|
|
|
{
|
util/unicode: Extract Method readUnicodeFile()
There's about a dozen files this program reads, and in each of these
cases, the code to read the file line-by-line, remove comments (or
just LF) and trim the line before further handling is duplicated. It's
also very inefficient, we have better APIs these days (readLineInto(),
rvalue *this overloads, truncate() instead of = left(), ...). Besides,
as Mårten pointed out in review, trimmed() already removes the LF, so
we don't need to do it manually.
So Extract Method readUnicodeFile() that does that, coroutine-style
(but with function object for now), from all the readX() functions
(except readUnicodeData() itself, which is using nested readLine()s.
Also maintain a line number for later improving the error messages.
Remove some isEmpty() checks in the lambdas that, after the
refactoring, can never be true (because removing whitespace from a
trimmed() string cannot make the string empty, ditto with
simplified()).
The extracted function could even pre-split the line along `;`, but
for that, I would port each lambda to use QByteArrayView / qTokenizer
first.
Picking to all active branches, because a) this is a tool and b) we
continue to update the Unicode tables in all active branches, so the
tool to do so should not differ, unless the target branch requires it
(changed data structures, e.g.). Note that readLineInto() is not in
6.5, but the tool is not required to be built against the Qt version
it is building tables for, so we can use the latest Qt features here.
Pick-to: 6.10 6.9 6.8 6.5
Change-Id: I3b699f213c98baa45bc8bbdb7ae2ac985d893798
Reviewed-by: Mårten Nordheim <marten.nordheim@qt.io>
Reviewed-by: Ahmad Samir <a.samirh78@gmail.com>
2025-08-26 17:10:45 +00:00
|
|
|
readUnicodeFile("BidiMirroring.txt",
|
2025-08-27 17:04:10 +00:00
|
|
|
[] (const QByteArray &line, int lineNo) {
|
2011-04-27 10:05:43 +00:00
|
|
|
QList<QByteArray> pair = line.split(';');
|
|
|
|
Q_ASSERT(pair.size() == 2);
|
|
|
|
|
2025-08-27 17:04:10 +00:00
|
|
|
const int codepoint = parseHex(pair[0], lineNo);
|
|
|
|
const int mirror = parseHex(pair[1], lineNo);
|
2011-04-27 10:05:43 +00:00
|
|
|
|
2025-08-27 11:40:56 +00:00
|
|
|
if (QChar::requiresSurrogates(codepoint) || QChar::requiresSurrogates(mirror)) {
|
|
|
|
qFatal("QTextEngine assumes that no mirrored pairs exist beyond the BMP, "
|
|
|
|
"but U+%05x and U+%05x (line %d) do. Fix the implementation.",
|
|
|
|
codepoint, mirror, lineNo);
|
|
|
|
}
|
|
|
|
|
2012-05-08 01:43:16 +00:00
|
|
|
UnicodeData &d = UnicodeData::valueRef(codepoint);
|
2011-04-27 10:05:43 +00:00
|
|
|
d.mirroredChar = mirror;
|
|
|
|
d.p.mirrorDiff = d.mirroredChar - codepoint;
|
|
|
|
maxMirroredDiff = qMax(maxMirroredDiff, qAbs(d.p.mirrorDiff));
|
util/unicode: Extract Method readUnicodeFile()
There's about a dozen files this program reads, and in each of these
cases, the code to read the file line-by-line, remove comments (or
just LF) and trim the line before further handling is duplicated. It's
also very inefficient, we have better APIs these days (readLineInto(),
rvalue *this overloads, truncate() instead of = left(), ...). Besides,
as Mårten pointed out in review, trimmed() already removes the LF, so
we don't need to do it manually.
So Extract Method readUnicodeFile() that does that, coroutine-style
(but with function object for now), from all the readX() functions
(except readUnicodeData() itself, which is using nested readLine()s.
Also maintain a line number for later improving the error messages.
Remove some isEmpty() checks in the lambdas that, after the
refactoring, can never be true (because removing whitespace from a
trimmed() string cannot make the string empty, ditto with
simplified()).
The extracted function could even pre-split the line along `;`, but
for that, I would port each lambda to use QByteArrayView / qTokenizer
first.
Picking to all active branches, because a) this is a tool and b) we
continue to update the Unicode tables in all active branches, so the
tool to do so should not differ, unless the target branch requires it
(changed data structures, e.g.). Note that readLineInto() is not in
6.5, but the tool is not required to be built against the Qt version
it is building tables for, so we can use the latest Qt features here.
Pick-to: 6.10 6.9 6.8 6.5
Change-Id: I3b699f213c98baa45bc8bbdb7ae2ac985d893798
Reviewed-by: Mårten Nordheim <marten.nordheim@qt.io>
Reviewed-by: Ahmad Samir <a.samirh78@gmail.com>
2025-08-26 17:10:45 +00:00
|
|
|
});
|
2011-04-27 10:05:43 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static void readArabicShaping()
|
|
|
|
{
|
2017-12-11 10:28:11 +00:00
|
|
|
// Initialize defaults:
|
|
|
|
// Code points that are not explicitly listed in ArabicShaping.txt are either of joining type T or U:
|
|
|
|
// - Those that not explicitly listed that are of General Category Mn, Me, or Cf have joining type T.
|
|
|
|
// - All others not explicitly listed have joining type U.
|
|
|
|
for (int codepoint = 0; codepoint <= QChar::LastValidCodePoint; ++codepoint) {
|
|
|
|
UnicodeData &d = UnicodeData::valueRef(codepoint);
|
|
|
|
if (d.p.joining == QChar::Joining_None) {
|
|
|
|
if (d.p.category == QChar::Mark_NonSpacing || d.p.category == QChar::Mark_Enclosing || d.p.category == QChar::Other_Format)
|
|
|
|
d.p.joining = QChar::Joining_Transparent;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
util/unicode: Extract Method readUnicodeFile()
There's about a dozen files this program reads, and in each of these
cases, the code to read the file line-by-line, remove comments (or
just LF) and trim the line before further handling is duplicated. It's
also very inefficient, we have better APIs these days (readLineInto(),
rvalue *this overloads, truncate() instead of = left(), ...). Besides,
as Mårten pointed out in review, trimmed() already removes the LF, so
we don't need to do it manually.
So Extract Method readUnicodeFile() that does that, coroutine-style
(but with function object for now), from all the readX() functions
(except readUnicodeData() itself, which is using nested readLine()s.
Also maintain a line number for later improving the error messages.
Remove some isEmpty() checks in the lambdas that, after the
refactoring, can never be true (because removing whitespace from a
trimmed() string cannot make the string empty, ditto with
simplified()).
The extracted function could even pre-split the line along `;`, but
for that, I would port each lambda to use QByteArrayView / qTokenizer
first.
Picking to all active branches, because a) this is a tool and b) we
continue to update the Unicode tables in all active branches, so the
tool to do so should not differ, unless the target branch requires it
(changed data structures, e.g.). Note that readLineInto() is not in
6.5, but the tool is not required to be built against the Qt version
it is building tables for, so we can use the latest Qt features here.
Pick-to: 6.10 6.9 6.8 6.5
Change-Id: I3b699f213c98baa45bc8bbdb7ae2ac985d893798
Reviewed-by: Mårten Nordheim <marten.nordheim@qt.io>
Reviewed-by: Ahmad Samir <a.samirh78@gmail.com>
2025-08-26 17:10:45 +00:00
|
|
|
readUnicodeFile("ArabicShaping.txt",
|
|
|
|
[] (const QByteArray &line, int lineNo) {
|
2011-04-27 10:05:43 +00:00
|
|
|
QList<QByteArray> l = line.split(';');
|
|
|
|
Q_ASSERT(l.size() == 4);
|
|
|
|
|
2025-08-27 17:04:10 +00:00
|
|
|
const int codepoint = parseHex(l[0], lineNo);
|
2011-04-27 10:05:43 +00:00
|
|
|
|
2014-01-26 00:42:37 +00:00
|
|
|
UnicodeData &d = UnicodeData::valueRef(codepoint);
|
|
|
|
JoiningType joining = joining_map.value(l[2].trimmed(), Joining_Unassigned);
|
|
|
|
switch (joining) {
|
|
|
|
case Joining_Unassigned:
|
|
|
|
qFatal("%x: unassigned or unhandled joining type: %s", codepoint, l[2].constData());
|
|
|
|
break;
|
|
|
|
case Joining_Transparent:
|
2019-10-23 15:17:49 +00:00
|
|
|
switch (d.p.category) {
|
|
|
|
case QChar::Mark_Enclosing:
|
|
|
|
case QChar::Mark_NonSpacing:
|
|
|
|
case QChar::Letter_Modifier:
|
|
|
|
case QChar::Other_Format:
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
qFatal("%x: joining type '%s' was met (category: %d); "
|
|
|
|
"the current implementation needs to be revised!",
|
|
|
|
codepoint, l[2].constData(), d.p.category);
|
2014-01-26 00:42:37 +00:00
|
|
|
}
|
2019-10-23 15:17:49 +00:00
|
|
|
Q_FALLTHROUGH();
|
2014-01-26 00:42:37 +00:00
|
|
|
default:
|
|
|
|
d.p.joining = QChar::JoiningType(joining);
|
|
|
|
break;
|
2011-04-27 10:05:43 +00:00
|
|
|
}
|
util/unicode: Extract Method readUnicodeFile()
There's about a dozen files this program reads, and in each of these
cases, the code to read the file line-by-line, remove comments (or
just LF) and trim the line before further handling is duplicated. It's
also very inefficient, we have better APIs these days (readLineInto(),
rvalue *this overloads, truncate() instead of = left(), ...). Besides,
as Mårten pointed out in review, trimmed() already removes the LF, so
we don't need to do it manually.
So Extract Method readUnicodeFile() that does that, coroutine-style
(but with function object for now), from all the readX() functions
(except readUnicodeData() itself, which is using nested readLine()s.
Also maintain a line number for later improving the error messages.
Remove some isEmpty() checks in the lambdas that, after the
refactoring, can never be true (because removing whitespace from a
trimmed() string cannot make the string empty, ditto with
simplified()).
The extracted function could even pre-split the line along `;`, but
for that, I would port each lambda to use QByteArrayView / qTokenizer
first.
Picking to all active branches, because a) this is a tool and b) we
continue to update the Unicode tables in all active branches, so the
tool to do so should not differ, unless the target branch requires it
(changed data structures, e.g.). Note that readLineInto() is not in
6.5, but the tool is not required to be built against the Qt version
it is building tables for, so we can use the latest Qt features here.
Pick-to: 6.10 6.9 6.8 6.5
Change-Id: I3b699f213c98baa45bc8bbdb7ae2ac985d893798
Reviewed-by: Mårten Nordheim <marten.nordheim@qt.io>
Reviewed-by: Ahmad Samir <a.samirh78@gmail.com>
2025-08-26 17:10:45 +00:00
|
|
|
});
|
2011-04-27 10:05:43 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static void readDerivedAge()
|
|
|
|
{
|
util/unicode: Extract Method readUnicodeFile()
There's about a dozen files this program reads, and in each of these
cases, the code to read the file line-by-line, remove comments (or
just LF) and trim the line before further handling is duplicated. It's
also very inefficient, we have better APIs these days (readLineInto(),
rvalue *this overloads, truncate() instead of = left(), ...). Besides,
as Mårten pointed out in review, trimmed() already removes the LF, so
we don't need to do it manually.
So Extract Method readUnicodeFile() that does that, coroutine-style
(but with function object for now), from all the readX() functions
(except readUnicodeData() itself, which is using nested readLine()s.
Also maintain a line number for later improving the error messages.
Remove some isEmpty() checks in the lambdas that, after the
refactoring, can never be true (because removing whitespace from a
trimmed() string cannot make the string empty, ditto with
simplified()).
The extracted function could even pre-split the line along `;`, but
for that, I would port each lambda to use QByteArrayView / qTokenizer
first.
Picking to all active branches, because a) this is a tool and b) we
continue to update the Unicode tables in all active branches, so the
tool to do so should not differ, unless the target branch requires it
(changed data structures, e.g.). Note that readLineInto() is not in
6.5, but the tool is not required to be built against the Qt version
it is building tables for, so we can use the latest Qt features here.
Pick-to: 6.10 6.9 6.8 6.5
Change-Id: I3b699f213c98baa45bc8bbdb7ae2ac985d893798
Reviewed-by: Mårten Nordheim <marten.nordheim@qt.io>
Reviewed-by: Ahmad Samir <a.samirh78@gmail.com>
2025-08-26 17:10:45 +00:00
|
|
|
readUnicodeFile("DerivedAge.txt",
|
2025-08-27 14:56:40 +00:00
|
|
|
[] (const QByteArray &line, int lineNo) {
|
2011-04-27 10:05:43 +00:00
|
|
|
QList<QByteArray> l = line.split(';');
|
|
|
|
Q_ASSERT(l.size() == 2);
|
|
|
|
|
2025-08-27 10:09:12 +00:00
|
|
|
const auto [from, to] = parseHexRange(l[0], lineNo);
|
2011-04-27 10:05:43 +00:00
|
|
|
|
|
|
|
QChar::UnicodeVersion age = age_map.value(l[1].trimmed(), QChar::Unicode_Unassigned);
|
2020-07-31 13:36:44 +00:00
|
|
|
//qDebug() << Qt::hex << from << ".." << to << ba << age;
|
2011-04-27 10:05:43 +00:00
|
|
|
if (age == QChar::Unicode_Unassigned)
|
|
|
|
qFatal("unassigned or unhandled age value: %s", l[1].constData());
|
|
|
|
|
|
|
|
for (int codepoint = from; codepoint <= to; ++codepoint) {
|
2012-05-08 01:43:16 +00:00
|
|
|
UnicodeData &d = UnicodeData::valueRef(codepoint);
|
2011-04-27 10:05:43 +00:00
|
|
|
d.p.age = age;
|
|
|
|
}
|
util/unicode: Extract Method readUnicodeFile()
There's about a dozen files this program reads, and in each of these
cases, the code to read the file line-by-line, remove comments (or
just LF) and trim the line before further handling is duplicated. It's
also very inefficient, we have better APIs these days (readLineInto(),
rvalue *this overloads, truncate() instead of = left(), ...). Besides,
as Mårten pointed out in review, trimmed() already removes the LF, so
we don't need to do it manually.
So Extract Method readUnicodeFile() that does that, coroutine-style
(but with function object for now), from all the readX() functions
(except readUnicodeData() itself, which is using nested readLine()s.
Also maintain a line number for later improving the error messages.
Remove some isEmpty() checks in the lambdas that, after the
refactoring, can never be true (because removing whitespace from a
trimmed() string cannot make the string empty, ditto with
simplified()).
The extracted function could even pre-split the line along `;`, but
for that, I would port each lambda to use QByteArrayView / qTokenizer
first.
Picking to all active branches, because a) this is a tool and b) we
continue to update the Unicode tables in all active branches, so the
tool to do so should not differ, unless the target branch requires it
(changed data structures, e.g.). Note that readLineInto() is not in
6.5, but the tool is not required to be built against the Qt version
it is building tables for, so we can use the latest Qt features here.
Pick-to: 6.10 6.9 6.8 6.5
Change-Id: I3b699f213c98baa45bc8bbdb7ae2ac985d893798
Reviewed-by: Mårten Nordheim <marten.nordheim@qt.io>
Reviewed-by: Ahmad Samir <a.samirh78@gmail.com>
2025-08-26 17:10:45 +00:00
|
|
|
});
|
2011-04-27 10:05:43 +00:00
|
|
|
}
|
|
|
|
|
2022-05-06 11:44:58 +00:00
|
|
|
static void readEastAsianWidth()
|
|
|
|
{
|
util/unicode: Extract Method readUnicodeFile()
There's about a dozen files this program reads, and in each of these
cases, the code to read the file line-by-line, remove comments (or
just LF) and trim the line before further handling is duplicated. It's
also very inefficient, we have better APIs these days (readLineInto(),
rvalue *this overloads, truncate() instead of = left(), ...). Besides,
as Mårten pointed out in review, trimmed() already removes the LF, so
we don't need to do it manually.
So Extract Method readUnicodeFile() that does that, coroutine-style
(but with function object for now), from all the readX() functions
(except readUnicodeData() itself, which is using nested readLine()s.
Also maintain a line number for later improving the error messages.
Remove some isEmpty() checks in the lambdas that, after the
refactoring, can never be true (because removing whitespace from a
trimmed() string cannot make the string empty, ditto with
simplified()).
The extracted function could even pre-split the line along `;`, but
for that, I would port each lambda to use QByteArrayView / qTokenizer
first.
Picking to all active branches, because a) this is a tool and b) we
continue to update the Unicode tables in all active branches, so the
tool to do so should not differ, unless the target branch requires it
(changed data structures, e.g.). Note that readLineInto() is not in
6.5, but the tool is not required to be built against the Qt version
it is building tables for, so we can use the latest Qt features here.
Pick-to: 6.10 6.9 6.8 6.5
Change-Id: I3b699f213c98baa45bc8bbdb7ae2ac985d893798
Reviewed-by: Mårten Nordheim <marten.nordheim@qt.io>
Reviewed-by: Ahmad Samir <a.samirh78@gmail.com>
2025-08-26 17:10:45 +00:00
|
|
|
readUnicodeFile("EastAsianWidth.txt",
|
|
|
|
[] (QByteArray &line, int lineNo) {
|
|
|
|
line = std::move(line).simplified();
|
2022-05-06 11:44:58 +00:00
|
|
|
|
|
|
|
QList<QByteArray> fields = line.split(';');
|
|
|
|
Q_ASSERT(fields.size() == 2);
|
|
|
|
|
2025-08-27 10:09:12 +00:00
|
|
|
const auto [first, last] = parseHexRange(fields[0], lineNo);
|
2022-05-06 11:44:58 +00:00
|
|
|
|
|
|
|
const QByteArray widthString = fields[1].trimmed();
|
|
|
|
if (!eastAsianWidthMap.contains(widthString)) {
|
|
|
|
qFatal("Unhandled EastAsianWidth property value for %s: %s",
|
2025-08-27 09:59:24 +00:00
|
|
|
fields[0].constData(), widthString.data());
|
2022-05-06 11:44:58 +00:00
|
|
|
}
|
|
|
|
auto width = eastAsianWidthMap.value(widthString);
|
|
|
|
|
|
|
|
for (int codepoint = first; codepoint <= last; ++codepoint) {
|
|
|
|
UnicodeData &ud = UnicodeData::valueRef(codepoint);
|
|
|
|
// Ensure that ranges don't overlap.
|
|
|
|
Q_ASSERT(ud.p.eastAsianWidth == EastAsianWidth::N);
|
|
|
|
ud.p.eastAsianWidth = width;
|
|
|
|
}
|
util/unicode: Extract Method readUnicodeFile()
There's about a dozen files this program reads, and in each of these
cases, the code to read the file line-by-line, remove comments (or
just LF) and trim the line before further handling is duplicated. It's
also very inefficient, we have better APIs these days (readLineInto(),
rvalue *this overloads, truncate() instead of = left(), ...). Besides,
as Mårten pointed out in review, trimmed() already removes the LF, so
we don't need to do it manually.
So Extract Method readUnicodeFile() that does that, coroutine-style
(but with function object for now), from all the readX() functions
(except readUnicodeData() itself, which is using nested readLine()s.
Also maintain a line number for later improving the error messages.
Remove some isEmpty() checks in the lambdas that, after the
refactoring, can never be true (because removing whitespace from a
trimmed() string cannot make the string empty, ditto with
simplified()).
The extracted function could even pre-split the line along `;`, but
for that, I would port each lambda to use QByteArrayView / qTokenizer
first.
Picking to all active branches, because a) this is a tool and b) we
continue to update the Unicode tables in all active branches, so the
tool to do so should not differ, unless the target branch requires it
(changed data structures, e.g.). Note that readLineInto() is not in
6.5, but the tool is not required to be built against the Qt version
it is building tables for, so we can use the latest Qt features here.
Pick-to: 6.10 6.9 6.8 6.5
Change-Id: I3b699f213c98baa45bc8bbdb7ae2ac985d893798
Reviewed-by: Mårten Nordheim <marten.nordheim@qt.io>
Reviewed-by: Ahmad Samir <a.samirh78@gmail.com>
2025-08-26 17:10:45 +00:00
|
|
|
});
|
2022-05-06 11:44:58 +00:00
|
|
|
}
|
|
|
|
|
2011-04-27 10:05:43 +00:00
|
|
|
static void readDerivedNormalizationProps()
|
|
|
|
{
|
util/unicode: Extract Method readUnicodeFile()
There's about a dozen files this program reads, and in each of these
cases, the code to read the file line-by-line, remove comments (or
just LF) and trim the line before further handling is duplicated. It's
also very inefficient, we have better APIs these days (readLineInto(),
rvalue *this overloads, truncate() instead of = left(), ...). Besides,
as Mårten pointed out in review, trimmed() already removes the LF, so
we don't need to do it manually.
So Extract Method readUnicodeFile() that does that, coroutine-style
(but with function object for now), from all the readX() functions
(except readUnicodeData() itself, which is using nested readLine()s.
Also maintain a line number for later improving the error messages.
Remove some isEmpty() checks in the lambdas that, after the
refactoring, can never be true (because removing whitespace from a
trimmed() string cannot make the string empty, ditto with
simplified()).
The extracted function could even pre-split the line along `;`, but
for that, I would port each lambda to use QByteArrayView / qTokenizer
first.
Picking to all active branches, because a) this is a tool and b) we
continue to update the Unicode tables in all active branches, so the
tool to do so should not differ, unless the target branch requires it
(changed data structures, e.g.). Note that readLineInto() is not in
6.5, but the tool is not required to be built against the Qt version
it is building tables for, so we can use the latest Qt features here.
Pick-to: 6.10 6.9 6.8 6.5
Change-Id: I3b699f213c98baa45bc8bbdb7ae2ac985d893798
Reviewed-by: Mårten Nordheim <marten.nordheim@qt.io>
Reviewed-by: Ahmad Samir <a.samirh78@gmail.com>
2025-08-26 17:10:45 +00:00
|
|
|
readUnicodeFile("DerivedNormalizationProps.txt",
|
|
|
|
[] (const QByteArray &line, int lineNo) {
|
2011-04-27 10:05:43 +00:00
|
|
|
QList<QByteArray> l = line.split(';');
|
|
|
|
Q_ASSERT(l.size() >= 2);
|
|
|
|
|
|
|
|
QByteArray propName = l[1].trimmed();
|
2013-08-10 12:41:52 +00:00
|
|
|
if (propName != "Full_Composition_Exclusion" &&
|
|
|
|
propName != "NFD_QC" && propName != "NFC_QC" &&
|
|
|
|
propName != "NFKD_QC" && propName != "NFKC_QC") {
|
2011-04-27 10:05:43 +00:00
|
|
|
// ###
|
util/unicode: Extract Method readUnicodeFile()
There's about a dozen files this program reads, and in each of these
cases, the code to read the file line-by-line, remove comments (or
just LF) and trim the line before further handling is duplicated. It's
also very inefficient, we have better APIs these days (readLineInto(),
rvalue *this overloads, truncate() instead of = left(), ...). Besides,
as Mårten pointed out in review, trimmed() already removes the LF, so
we don't need to do it manually.
So Extract Method readUnicodeFile() that does that, coroutine-style
(but with function object for now), from all the readX() functions
(except readUnicodeData() itself, which is using nested readLine()s.
Also maintain a line number for later improving the error messages.
Remove some isEmpty() checks in the lambdas that, after the
refactoring, can never be true (because removing whitespace from a
trimmed() string cannot make the string empty, ditto with
simplified()).
The extracted function could even pre-split the line along `;`, but
for that, I would port each lambda to use QByteArrayView / qTokenizer
first.
Picking to all active branches, because a) this is a tool and b) we
continue to update the Unicode tables in all active branches, so the
tool to do so should not differ, unless the target branch requires it
(changed data structures, e.g.). Note that readLineInto() is not in
6.5, but the tool is not required to be built against the Qt version
it is building tables for, so we can use the latest Qt features here.
Pick-to: 6.10 6.9 6.8 6.5
Change-Id: I3b699f213c98baa45bc8bbdb7ae2ac985d893798
Reviewed-by: Mårten Nordheim <marten.nordheim@qt.io>
Reviewed-by: Ahmad Samir <a.samirh78@gmail.com>
2025-08-26 17:10:45 +00:00
|
|
|
return;
|
2013-08-10 12:41:52 +00:00
|
|
|
}
|
2011-04-27 10:05:43 +00:00
|
|
|
|
2025-08-27 10:09:12 +00:00
|
|
|
const auto [from, to] = parseHexRange(l[0], lineNo);
|
2011-04-27 10:05:43 +00:00
|
|
|
|
|
|
|
for (int codepoint = from; codepoint <= to; ++codepoint) {
|
2012-05-08 01:43:16 +00:00
|
|
|
UnicodeData &d = UnicodeData::valueRef(codepoint);
|
2013-08-10 12:41:52 +00:00
|
|
|
if (propName == "Full_Composition_Exclusion") {
|
|
|
|
d.excludedComposition = true;
|
|
|
|
} else {
|
2020-06-11 09:35:19 +00:00
|
|
|
static_assert(QString::NormalizationForm_D == 0);
|
|
|
|
static_assert(QString::NormalizationForm_C == 1);
|
|
|
|
static_assert(QString::NormalizationForm_KD == 2);
|
|
|
|
static_assert(QString::NormalizationForm_KC == 3);
|
2013-08-10 12:41:52 +00:00
|
|
|
|
|
|
|
QString::NormalizationForm form;
|
|
|
|
if (propName == "NFD_QC")
|
|
|
|
form = QString::NormalizationForm_D;
|
|
|
|
else if (propName == "NFC_QC")
|
|
|
|
form = QString::NormalizationForm_C;
|
|
|
|
else if (propName == "NFKD_QC")
|
|
|
|
form = QString::NormalizationForm_KD;
|
|
|
|
else// if (propName == "NFKC_QC")
|
|
|
|
form = QString::NormalizationForm_KC;
|
|
|
|
|
|
|
|
Q_ASSERT(l.size() == 3);
|
|
|
|
l[2] = l[2].trimmed();
|
|
|
|
|
|
|
|
enum { NFQC_YES = 0, NFQC_NO = 1, NFQC_MAYBE = 3 };
|
|
|
|
uchar ynm = (l[2] == "N" ? NFQC_NO : l[2] == "M" ? NFQC_MAYBE : NFQC_YES);
|
|
|
|
if (ynm == NFQC_MAYBE) {
|
|
|
|
// if this changes, we need to revise the normalizationQuickCheckHelper() implementation
|
|
|
|
Q_ASSERT(form == QString::NormalizationForm_C || form == QString::NormalizationForm_KC);
|
|
|
|
}
|
|
|
|
d.p.nfQuickCheck |= (ynm << (form << 1)); // 2 bits per NF
|
|
|
|
}
|
2011-04-27 10:05:43 +00:00
|
|
|
}
|
util/unicode: Extract Method readUnicodeFile()
There's about a dozen files this program reads, and in each of these
cases, the code to read the file line-by-line, remove comments (or
just LF) and trim the line before further handling is duplicated. It's
also very inefficient, we have better APIs these days (readLineInto(),
rvalue *this overloads, truncate() instead of = left(), ...). Besides,
as Mårten pointed out in review, trimmed() already removes the LF, so
we don't need to do it manually.
So Extract Method readUnicodeFile() that does that, coroutine-style
(but with function object for now), from all the readX() functions
(except readUnicodeData() itself, which is using nested readLine()s.
Also maintain a line number for later improving the error messages.
Remove some isEmpty() checks in the lambdas that, after the
refactoring, can never be true (because removing whitespace from a
trimmed() string cannot make the string empty, ditto with
simplified()).
The extracted function could even pre-split the line along `;`, but
for that, I would port each lambda to use QByteArrayView / qTokenizer
first.
Picking to all active branches, because a) this is a tool and b) we
continue to update the Unicode tables in all active branches, so the
tool to do so should not differ, unless the target branch requires it
(changed data structures, e.g.). Note that readLineInto() is not in
6.5, but the tool is not required to be built against the Qt version
it is building tables for, so we can use the latest Qt features here.
Pick-to: 6.10 6.9 6.8 6.5
Change-Id: I3b699f213c98baa45bc8bbdb7ae2ac985d893798
Reviewed-by: Mårten Nordheim <marten.nordheim@qt.io>
Reviewed-by: Ahmad Samir <a.samirh78@gmail.com>
2025-08-26 17:10:45 +00:00
|
|
|
});
|
2011-04-27 10:05:43 +00:00
|
|
|
|
2012-05-15 17:48:20 +00:00
|
|
|
for (int codepoint = 0; codepoint <= QChar::LastValidCodePoint; ++codepoint) {
|
2012-05-08 01:43:16 +00:00
|
|
|
UnicodeData &d = UnicodeData::valueRef(codepoint);
|
2011-04-27 10:05:43 +00:00
|
|
|
if (!d.excludedComposition
|
|
|
|
&& d.decompositionType == QChar::Canonical
|
|
|
|
&& d.decomposition.size() > 1) {
|
|
|
|
Q_ASSERT(d.decomposition.size() == 2);
|
|
|
|
|
|
|
|
int part1 = d.decomposition.at(0);
|
|
|
|
int part2 = d.decomposition.at(1);
|
|
|
|
|
|
|
|
// all non-starters are listed in DerivedNormalizationProps.txt
|
|
|
|
// and already excluded from composition
|
2012-05-08 01:43:16 +00:00
|
|
|
Q_ASSERT(UnicodeData::valueRef(part1).p.combiningClass == 0);
|
2011-04-27 10:05:43 +00:00
|
|
|
|
|
|
|
++numLigatures;
|
|
|
|
highestLigature = qMax(highestLigature, part1);
|
2012-04-26 16:29:08 +00:00
|
|
|
Ligature l = { part1, part2, codepoint };
|
2011-04-27 10:05:43 +00:00
|
|
|
ligatureHashes[part2].append(l);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// One entry parsed from NormalizationCorrections.txt, as filled in by
// createNormalizationCorrections().
struct NormalizationCorrection {
    uint codepoint; // parsed from the first field of the line
    uint mapped;    // parsed from the second field of the line
    int version;    // QChar::UnicodeVersion value selected by the fourth field
};
|
|
|
|
|
|
|
|
// Generates the C++ source text for the normalization corrections table.
//
// The returned byte array contains the declaration of
// `struct NormalizationCorrection`, the `uc_normalization_corrections[]`
// array with one initializer per line of NormalizationCorrections.txt, and
// two enums holding the number of entries and the highest version value
// seen.
//
// Each input line has all spaces removed and is then split at ';' into
// exactly four fields; fields 0 and 1 are parsed by parseHex(), field 3
// must be "3.2.0" or "4.0.0". Any other version string aborts the run via
// qFatal(), which reports the offending value and line number.
static QByteArray createNormalizationCorrections()
{
    QByteArray out
        = "struct NormalizationCorrection {\n"
          " uint ucs4;\n"
          " uint old_mapping;\n"
          " int version;\n"
          "};\n\n"

          "static constexpr NormalizationCorrection uc_normalization_corrections[] = {\n";

    int maxVersion = 0;
    int numCorrections = 0;
    readUnicodeFile("NormalizationCorrections.txt",
                    [&] (QByteArray &line, int lineNo) {
        line.replace(" ", "");

        // Only single code points are expected here, never ranges.
        Q_ASSERT(!line.contains(".."));

        QList<QByteArray> fields = line.split(';');
        Q_ASSERT(fields.size() == 4);

        NormalizationCorrection c = { 0, 0, 0 };
        c.codepoint = parseHex(fields[0], lineNo);
        c.mapped = parseHex(fields[1], lineNo);

        const QByteArray &versionField = fields.at(3);
        if (versionField == "3.2.0") {
            c.version = QChar::Unicode_3_2;
        } else if (versionField == "4.0.0") {
            c.version = QChar::Unicode_4_0;
        } else {
            qFatal("unknown unicode version '%s' in NormalizationCorrections.txt:%d",
                   versionField.constData(), lineNo);
        }

        out += " { 0x" + QByteArray::number(c.codepoint, 16) + ", 0x"
               + QByteArray::number(c.mapped, 16) + ", "
               + QByteArray::number(c.version) + " },\n";
        ++numCorrections;
        maxVersion = qMax(c.version, maxVersion);
    });

    // Drop the trailing ",\n" left behind by the last table entry.
    if (out.endsWith(",\n"))
        out.chop(2);

    out += "\n};\n\n"

           "enum { NumNormalizationCorrections = " + QByteArray::number(numCorrections) + " };\n"
           "enum { NormalizationCorrectionsVersionMax = " + QByteArray::number(maxVersion) + " };\n\n";

    return out;
}
|
|
|
|
|
|
|
|
static void readLineBreak()
|
|
|
|
{
|
util/unicode: Extract Method readUnicodeFile()
There's about a dozen files this program reads, and in each of these
cases, the code to read the file line-by-line, remove comments (or
just LF) and trim the line before further handling is duplicated. It's
also very inefficient, we have better APIs these days (readLineInto(),
rvalue *this overloads, truncate() instead of = left(), ...). Besides,
as Mårten pointed out in review, trimmed() already removes the LF, so
we don't need to do it manually.
So Extract Method readUnicodeFile() that does that, coroutine-style
(but with function object for now), from all the readX() functions
(except readUnicodeData() itself, which is using nested readLine()s.
Also maintain a line number for later improving the error messages.
Remove some isEmpty() checks in the lambdas that, after the
refactoring, can never be true (because removing whitespace from a
trimmed() string cannot make the string empty, ditto with
simplified()).
The extracted function could even pre-split the line along `;`, but
for that, I would port each lambda to use QByteArrayView / qTokenizer
first.
Picking to all active branches, because a) this is a tool and b) we
continue to update the Unicode tables in all active branches, so the
tool to do so should not differ, unless the target branch requires it
(changed data structures, e.g.). Note that readLineInto() is not in
6.5, but the tool is not required to be built against the Qt version
it is building tables for, so we can use the latest Qt features here.
Pick-to: 6.10 6.9 6.8 6.5
Change-Id: I3b699f213c98baa45bc8bbdb7ae2ac985d893798
Reviewed-by: Mårten Nordheim <marten.nordheim@qt.io>
Reviewed-by: Ahmad Samir <a.samirh78@gmail.com>
2025-08-26 17:10:45 +00:00
|
|
|
readUnicodeFile("LineBreak.txt",
|
|
|
|
[] (QByteArray &line, int lineNo) {
|
2011-04-27 10:05:43 +00:00
|
|
|
line.replace(" ", "");
|
|
|
|
|
|
|
|
QList<QByteArray> l = line.split(';');
|
|
|
|
Q_ASSERT(l.size() == 2);
|
|
|
|
|
2025-08-27 10:09:12 +00:00
|
|
|
const auto [from, to] = parseHexRange(l[0], lineNo);
|
2011-04-27 10:05:43 +00:00
|
|
|
|
|
|
|
LineBreakClass lb = line_break_map.value(l[1], LineBreak_Unassigned);
|
|
|
|
if (lb == LineBreak_Unassigned)
|
|
|
|
qFatal("unassigned line break class: %s", l[1].constData());
|
|
|
|
|
|
|
|
for (int codepoint = from; codepoint <= to; ++codepoint) {
|
2012-05-08 01:43:16 +00:00
|
|
|
UnicodeData &d = UnicodeData::valueRef(codepoint);
|
2012-06-17 01:55:07 +00:00
|
|
|
d.p.lineBreakClass = lb;
|
2011-04-27 10:05:43 +00:00
|
|
|
}
|
util/unicode: Extract Method readUnicodeFile()
There's about a dozen files this program reads, and in each of these
cases, the code to read the file line-by-line, remove comments (or
just LF) and trim the line before further handling is duplicated. It's
also very inefficient, we have better APIs these days (readLineInto(),
rvalue *this overloads, truncate() instead of = left(), ...). Besides,
as Mårten pointed out in review, trimmed() already removes the LF, so
we don't need to do it manually.
So Extract Method readUnicodeFile() that does that, coroutine-style
(but with function object for now), from all the readX() functions
(except readUnicodeData() itself, which is using nested readLine()s.
Also maintain a line number for later improving the error messages.
Remove some isEmpty() checks in the lambdas that, after the
refactoring, can never be true (because removing whitespace from a
trimmed() string cannot make the string empty, ditto with
simplified()).
The extracted function could even pre-split the line along `;`, but
for that, I would port each lambda to use QByteArrayView / qTokenizer
first.
Picking to all active branches, because a) this is a tool and b) we
continue to update the Unicode tables in all active branches, so the
tool to do so should not differ, unless the target branch requires it
(changed data structures, e.g.). Note that readLineInto() is not in
6.5, but the tool is not required to be built against the Qt version
it is building tables for, so we can use the latest Qt features here.
Pick-to: 6.10 6.9 6.8 6.5
Change-Id: I3b699f213c98baa45bc8bbdb7ae2ac985d893798
Reviewed-by: Mårten Nordheim <marten.nordheim@qt.io>
Reviewed-by: Ahmad Samir <a.samirh78@gmail.com>
2025-08-26 17:10:45 +00:00
|
|
|
});
|
2011-04-27 10:05:43 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static void readSpecialCasing()
|
|
|
|
{
|
util/unicode: Extract Method readUnicodeFile()
There's about a dozen files this program reads, and in each of these
cases, the code to read the file line-by-line, remove comments (or
just LF) and trim the line before further handling is duplicated. It's
also very inefficient, we have better APIs these days (readLineInto(),
rvalue *this overloads, truncate() instead of = left(), ...). Besides,
as Mårten pointed out in review, trimmed() already removes the LF, so
we don't need to do it manually.
So Extract Method readUnicodeFile() that does that, coroutine-style
(but with function object for now), from all the readX() functions
(except readUnicodeData() itself, which is using nested readLine()s.
Also maintain a line number for later improving the error messages.
Remove some isEmpty() checks in the lambdas that, after the
refactoring, can never be true (because removing whitespace from a
trimmed() string cannot make the string empty, ditto with
simplified()).
The extracted function could even pre-split the line along `;`, but
for that, I would port each lambda to use QByteArrayView / qTokenizer
first.
Picking to all active branches, because a) this is a tool and b) we
continue to update the Unicode tables in all active branches, so the
tool to do so should not differ, unless the target branch requires it
(changed data structures, e.g.). Note that readLineInto() is not in
6.5, but the tool is not required to be built against the Qt version
it is building tables for, so we can use the latest Qt features here.
Pick-to: 6.10 6.9 6.8 6.5
Change-Id: I3b699f213c98baa45bc8bbdb7ae2ac985d893798
Reviewed-by: Mårten Nordheim <marten.nordheim@qt.io>
Reviewed-by: Ahmad Samir <a.samirh78@gmail.com>
2025-08-26 17:10:45 +00:00
|
|
|
readUnicodeFile("SpecialCasing.txt",
|
|
|
|
[] (const QByteArray &line, int lineNo) {
|
2011-04-27 10:05:43 +00:00
|
|
|
QList<QByteArray> l = line.split(';');
|
|
|
|
|
|
|
|
QByteArray condition = l.size() < 5 ? QByteArray() : l[4].trimmed();
|
|
|
|
if (!condition.isEmpty())
|
|
|
|
// #####
|
util/unicode: Extract Method readUnicodeFile()
There's about a dozen files this program reads, and in each of these
cases, the code to read the file line-by-line, remove comments (or
just LF) and trim the line before further handling is duplicated. It's
also very inefficient, we have better APIs these days (readLineInto(),
rvalue *this overloads, truncate() instead of = left(), ...). Besides,
as Mårten pointed out in review, trimmed() already removes the LF, so
we don't need to do it manually.
So Extract Method readUnicodeFile() that does that, coroutine-style
(but with function object for now), from all the readX() functions
(except readUnicodeData() itself, which is using nested readLine()s.
Also maintain a line number for later improving the error messages.
Remove some isEmpty() checks in the lambdas that, after the
refactoring, can never be true (because removing whitespace from a
trimmed() string cannot make the string empty, ditto with
simplified()).
The extracted function could even pre-split the line along `;`, but
for that, I would port each lambda to use QByteArrayView / qTokenizer
first.
Picking to all active branches, because a) this is a tool and b) we
continue to update the Unicode tables in all active branches, so the
tool to do so should not differ, unless the target branch requires it
(changed data structures, e.g.). Note that readLineInto() is not in
6.5, but the tool is not required to be built against the Qt version
it is building tables for, so we can use the latest Qt features here.
Pick-to: 6.10 6.9 6.8 6.5
Change-Id: I3b699f213c98baa45bc8bbdb7ae2ac985d893798
Reviewed-by: Mårten Nordheim <marten.nordheim@qt.io>
Reviewed-by: Ahmad Samir <a.samirh78@gmail.com>
2025-08-26 17:10:45 +00:00
|
|
|
return;
|
2011-04-27 10:05:43 +00:00
|
|
|
|
2025-08-27 17:04:10 +00:00
|
|
|
const int codepoint = parseHex(l[0], lineNo);
|
2011-04-27 10:05:43 +00:00
|
|
|
|
|
|
|
// if the condition below doesn't hold anymore we need to modify our
|
|
|
|
// lower/upper/title casing code and case folding code
|
2012-04-08 07:18:45 +00:00
|
|
|
Q_ASSERT(!QChar::requiresSurrogates(codepoint));
|
2011-04-27 10:05:43 +00:00
|
|
|
|
2020-07-31 13:36:44 +00:00
|
|
|
// qDebug() << "codepoint" << Qt::hex << codepoint;
|
2011-04-27 10:05:43 +00:00
|
|
|
// qDebug() << line;
|
|
|
|
|
2025-08-27 09:36:31 +00:00
|
|
|
const auto lowerMap = parseHexList(l[1], lineNo);
|
|
|
|
const auto titleMap = parseHexList(l[2], lineNo);
|
|
|
|
const auto upperMap = parseHexList(l[3], lineNo);
|
2011-04-27 10:05:43 +00:00
|
|
|
|
|
|
|
|
2012-05-08 01:43:16 +00:00
|
|
|
UnicodeData &ud = UnicodeData::valueRef(codepoint);
|
2011-04-27 10:05:43 +00:00
|
|
|
Q_ASSERT(lowerMap.size() > 1 || lowerMap.at(0) == codepoint + ud.p.lowerCaseDiff);
|
|
|
|
Q_ASSERT(titleMap.size() > 1 || titleMap.at(0) == codepoint + ud.p.titleCaseDiff);
|
|
|
|
Q_ASSERT(upperMap.size() > 1 || upperMap.at(0) == codepoint + ud.p.upperCaseDiff);
|
|
|
|
|
|
|
|
if (lowerMap.size() > 1) {
|
|
|
|
ud.p.lowerCaseSpecial = true;
|
|
|
|
ud.p.lowerCaseDiff = appendToSpecialCaseMap(lowerMap);
|
|
|
|
}
|
|
|
|
if (titleMap.size() > 1) {
|
|
|
|
ud.p.titleCaseSpecial = true;
|
|
|
|
ud.p.titleCaseDiff = appendToSpecialCaseMap(titleMap);
|
|
|
|
}
|
|
|
|
if (upperMap.size() > 1) {
|
|
|
|
ud.p.upperCaseSpecial = true;
|
2012-04-23 03:00:16 +00:00
|
|
|
ud.p.upperCaseDiff = appendToSpecialCaseMap(upperMap);
|
2011-04-27 10:05:43 +00:00
|
|
|
}
|
util/unicode: Extract Method readUnicodeFile()
There's about a dozen files this program reads, and in each of these
cases, the code to read the file line-by-line, remove comments (or
just LF) and trim the line before further handling is duplicated. It's
also very inefficient, we have better APIs these days (readLineInto(),
rvalue *this overloads, truncate() instead of = left(), ...). Besides,
as Mårten pointed out in review, trimmed() already removes the LF, so
we don't need to do it manually.
So Extract Method readUnicodeFile() that does that, coroutine-style
(but with function object for now), from all the readX() functions
(except readUnicodeData() itself, which is using nested readLine()s.
Also maintain a line number for later improving the error messages.
Remove some isEmpty() checks in the lambdas that, after the
refactoring, can never be true (because removing whitespace from a
trimmed() string cannot make the string empty, ditto with
simplified()).
The extracted function could even pre-split the line along `;`, but
for that, I would port each lambda to use QByteArrayView / qTokenizer
first.
Picking to all active branches, because a) this is a tool and b) we
continue to update the Unicode tables in all active branches, so the
tool to do so should not differ, unless the target branch requires it
(changed data structures, e.g.). Note that readLineInto() is not in
6.5, but the tool is not required to be built against the Qt version
it is building tables for, so we can use the latest Qt features here.
Pick-to: 6.10 6.9 6.8 6.5
Change-Id: I3b699f213c98baa45bc8bbdb7ae2ac985d893798
Reviewed-by: Mårten Nordheim <marten.nordheim@qt.io>
Reviewed-by: Ahmad Samir <a.samirh78@gmail.com>
2025-08-26 17:10:45 +00:00
|
|
|
});
|
2011-04-27 10:05:43 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Largest absolute case-fold offset that readCaseFolding() stored directly in
// a code point's caseFoldDiff (i.e. without going through the special-case map).
static int maxCaseFoldDiff = 0;
|
|
|
|
|
|
|
|
static void readCaseFolding()
|
|
|
|
{
|
util/unicode: Extract Method readUnicodeFile()
There's about a dozen files this program reads, and in each of these
cases, the code to read the file line-by-line, remove comments (or
just LF) and trim the line before further handling is duplicated. It's
also very inefficient, we have better APIs these days (readLineInto(),
rvalue *this overloads, truncate() instead of = left(), ...). Besides,
as Mårten pointed out in review, trimmed() already removes the LF, so
we don't need to do it manually.
So Extract Method readUnicodeFile() that does that, coroutine-style
(but with function object for now), from all the readX() functions
(except readUnicodeData() itself, which is using nested readLine()s.
Also maintain a line number for later improving the error messages.
Remove some isEmpty() checks in the lambdas that, after the
refactoring, can never be true (because removing whitespace from a
trimmed() string cannot make the string empty, ditto with
simplified()).
The extracted function could even pre-split the line along `;`, but
for that, I would port each lambda to use QByteArrayView / qTokenizer
first.
Picking to all active branches, because a) this is a tool and b) we
continue to update the Unicode tables in all active branches, so the
tool to do so should not differ, unless the target branch requires it
(changed data structures, e.g.). Note that readLineInto() is not in
6.5, but the tool is not required to be built against the Qt version
it is building tables for, so we can use the latest Qt features here.
Pick-to: 6.10 6.9 6.8 6.5
Change-Id: I3b699f213c98baa45bc8bbdb7ae2ac985d893798
Reviewed-by: Mårten Nordheim <marten.nordheim@qt.io>
Reviewed-by: Ahmad Samir <a.samirh78@gmail.com>
2025-08-26 17:10:45 +00:00
|
|
|
readUnicodeFile("CaseFolding.txt",
|
|
|
|
[] (const QByteArray &line, int lineNo) {
|
2011-04-27 10:05:43 +00:00
|
|
|
QList<QByteArray> l = line.split(';');
|
|
|
|
|
2025-08-27 17:04:10 +00:00
|
|
|
const int codepoint = parseHex(l[0], lineNo);
|
2011-04-27 10:05:43 +00:00
|
|
|
|
|
|
|
l[1] = l[1].trimmed();
|
|
|
|
if (l[1] == "F" || l[1] == "T")
|
util/unicode: Extract Method readUnicodeFile()
There's about a dozen files this program reads, and in each of these
cases, the code to read the file line-by-line, remove comments (or
just LF) and trim the line before further handling is duplicated. It's
also very inefficient, we have better APIs these days (readLineInto(),
rvalue *this overloads, truncate() instead of = left(), ...). Besides,
as Mårten pointed out in review, trimmed() already removes the LF, so
we don't need to do it manually.
So Extract Method readUnicodeFile() that does that, coroutine-style
(but with function object for now), from all the readX() functions
(except readUnicodeData() itself, which is using nested readLine()s.
Also maintain a line number for later improving the error messages.
Remove some isEmpty() checks in the lambdas that, after the
refactoring, can never be true (because removing whitespace from a
trimmed() string cannot make the string empty, ditto with
simplified()).
The extracted function could even pre-split the line along `;`, but
for that, I would port each lambda to use QByteArrayView / qTokenizer
first.
Picking to all active branches, because a) this is a tool and b) we
continue to update the Unicode tables in all active branches, so the
tool to do so should not differ, unless the target branch requires it
(changed data structures, e.g.). Note that readLineInto() is not in
6.5, but the tool is not required to be built against the Qt version
it is building tables for, so we can use the latest Qt features here.
Pick-to: 6.10 6.9 6.8 6.5
Change-Id: I3b699f213c98baa45bc8bbdb7ae2ac985d893798
Reviewed-by: Mårten Nordheim <marten.nordheim@qt.io>
Reviewed-by: Ahmad Samir <a.samirh78@gmail.com>
2025-08-26 17:10:45 +00:00
|
|
|
return;
|
2011-04-27 10:05:43 +00:00
|
|
|
|
2020-07-31 13:36:44 +00:00
|
|
|
// qDebug() << "codepoint" << Qt::hex << codepoint;
|
2011-04-27 10:05:43 +00:00
|
|
|
// qDebug() << line;
|
2025-08-27 09:36:31 +00:00
|
|
|
const auto foldMap = parseHexList(l[2], lineNo);
|
2011-04-27 10:05:43 +00:00
|
|
|
|
2012-05-08 01:43:16 +00:00
|
|
|
UnicodeData &ud = UnicodeData::valueRef(codepoint);
|
2011-04-27 10:05:43 +00:00
|
|
|
if (foldMap.size() == 1) {
|
|
|
|
int caseFolded = foldMap.at(0);
|
|
|
|
int diff = caseFolded - codepoint;
|
2015-11-06 20:31:43 +00:00
|
|
|
// if the conditions below doesn't hold anymore we need to modify our case folding code
|
|
|
|
Q_ASSERT(QChar::requiresSurrogates(codepoint) == QChar::requiresSurrogates(caseFolded));
|
|
|
|
if (QChar::requiresSurrogates(codepoint)) {
|
|
|
|
Q_ASSERT(QChar::highSurrogate(codepoint) == QChar::highSurrogate(caseFolded));
|
|
|
|
Q_ASSERT(QChar::lowSurrogate(codepoint) + diff == QChar::lowSurrogate(caseFolded));
|
|
|
|
}
|
Update Unicode data up to v7.0
* Two newly adopted currency symbols:
the Azerbaijan manat and the Russia ruble
* Pictographic symbols (including many emoji), geometric symbols,
arrows, and ornaments originating from the Wingdings and Webdings sets
* Twenty-three new lesser-used and historic scripts
extending support for written languages of North America, China, India,
other Asian countries, and Africa
* Letters used in Teuthonista and other transcriptional systems,
and a new notational set, Duployan
For more details, see http://www.unicode.org/versions/Unicode7.0.0/
The Properties struct's .*Diff members were narrowed down
to signed 15 bits and the unicodeVersion has been expanded to 8 bits.
[ChangeLog][QtCore] Unicode data updated to v.7.0
Change-Id: I93ab6f79fa3b05f61abc7279f1d046834c1c1a0b
Reviewed-by: Lars Knoll <lars.knoll@digia.com>
2015-03-23 21:01:06 +00:00
|
|
|
if (qAbs(diff) >= (1<<13)) {
|
2012-04-23 15:11:57 +00:00
|
|
|
ud.p.caseFoldSpecial = true;
|
|
|
|
ud.p.caseFoldDiff = appendToSpecialCaseMap(foldMap);
|
|
|
|
} else {
|
|
|
|
ud.p.caseFoldDiff = diff;
|
|
|
|
maxCaseFoldDiff = qMax(maxCaseFoldDiff, qAbs(diff));
|
|
|
|
}
|
2011-04-27 10:05:43 +00:00
|
|
|
} else {
|
|
|
|
qFatal("we currently don't support full case foldings");
|
2020-07-31 13:36:44 +00:00
|
|
|
// qDebug() << "special" << Qt::hex << foldMap;
|
2011-04-27 10:05:43 +00:00
|
|
|
ud.p.caseFoldSpecial = true;
|
|
|
|
ud.p.caseFoldDiff = appendToSpecialCaseMap(foldMap);
|
|
|
|
}
|
util/unicode: Extract Method readUnicodeFile()
There's about a dozen files this program reads, and in each of these
cases, the code to read the file line-by-line, remove comments (or
just LF) and trim the line before further handling is duplicated. It's
also very inefficient, we have better APIs these days (readLineInto(),
rvalue *this overloads, truncate() instead of = left(), ...). Besides,
as Mårten pointed out in review, trimmed() already removes the LF, so
we don't need to do it manually.
So Extract Method readUnicodeFile() that does that, coroutine-style
(but with function object for now), from all the readX() functions
(except readUnicodeData() itself, which is using nested readLine()s.
Also maintain a line number for later improving the error messages.
Remove some isEmpty() checks in the lambdas that, after the
refactoring, can never be true (because removing whitespace from a
trimmed() string cannot make the string empty, ditto with
simplified()).
The extracted function could even pre-split the line along `;`, but
for that, I would port each lambda to use QByteArrayView / qTokenizer
first.
Picking to all active branches, because a) this is a tool and b) we
continue to update the Unicode tables in all active branches, so the
tool to do so should not differ, unless the target branch requires it
(changed data structures, e.g.). Note that readLineInto() is not in
6.5, but the tool is not required to be built against the Qt version
it is building tables for, so we can use the latest Qt features here.
Pick-to: 6.10 6.9 6.8 6.5
Change-Id: I3b699f213c98baa45bc8bbdb7ae2ac985d893798
Reviewed-by: Mårten Nordheim <marten.nordheim@qt.io>
Reviewed-by: Ahmad Samir <a.samirh78@gmail.com>
2025-08-26 17:10:45 +00:00
|
|
|
});
|
2011-04-27 10:05:43 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static void readGraphemeBreak()
|
|
|
|
{
|
util/unicode: Extract Method readUnicodeFile()
There's about a dozen files this program reads, and in each of these
cases, the code to read the file line-by-line, remove comments (or
just LF) and trim the line before further handling is duplicated. It's
also very inefficient, we have better APIs these days (readLineInto(),
rvalue *this overloads, truncate() instead of = left(), ...). Besides,
as Mårten pointed out in review, trimmed() already removes the LF, so
we don't need to do it manually.
So Extract Method readUnicodeFile() that does that, coroutine-style
(but with function object for now), from all the readX() functions
(except readUnicodeData() itself, which is using nested readLine()s.
Also maintain a line number for later improving the error messages.
Remove some isEmpty() checks in the lambdas that, after the
refactoring, can never be true (because removing whitespace from a
trimmed() string cannot make the string empty, ditto with
simplified()).
The extracted function could even pre-split the line along `;`, but
for that, I would port each lambda to use QByteArrayView / qTokenizer
first.
Picking to all active branches, because a) this is a tool and b) we
continue to update the Unicode tables in all active branches, so the
tool to do so should not differ, unless the target branch requires it
(changed data structures, e.g.). Note that readLineInto() is not in
6.5, but the tool is not required to be built against the Qt version
it is building tables for, so we can use the latest Qt features here.
Pick-to: 6.10 6.9 6.8 6.5
Change-Id: I3b699f213c98baa45bc8bbdb7ae2ac985d893798
Reviewed-by: Mårten Nordheim <marten.nordheim@qt.io>
Reviewed-by: Ahmad Samir <a.samirh78@gmail.com>
2025-08-26 17:10:45 +00:00
|
|
|
readUnicodeFile("GraphemeBreakProperty.txt",
|
|
|
|
[] (QByteArray &line, int lineNo) {
|
2012-06-17 01:20:59 +00:00
|
|
|
|
2011-04-27 10:05:43 +00:00
|
|
|
line.replace(" ", "");
|
|
|
|
|
|
|
|
QList<QByteArray> l = line.split(';');
|
|
|
|
Q_ASSERT(l.size() == 2);
|
|
|
|
|
2025-08-27 10:09:12 +00:00
|
|
|
const auto [from, to] = parseHexRange(l[0], lineNo);
|
2011-04-27 10:05:43 +00:00
|
|
|
|
2012-06-17 01:55:07 +00:00
|
|
|
GraphemeBreakClass brk = grapheme_break_map.value(l[1], GraphemeBreak_Unassigned);
|
2011-04-27 10:05:43 +00:00
|
|
|
if (brk == GraphemeBreak_Unassigned)
|
|
|
|
qFatal("unassigned grapheme break class: %s", l[1].constData());
|
|
|
|
|
|
|
|
for (int codepoint = from; codepoint <= to; ++codepoint) {
|
2012-05-08 01:43:16 +00:00
|
|
|
UnicodeData &ud = UnicodeData::valueRef(codepoint);
|
2012-06-17 01:55:07 +00:00
|
|
|
ud.p.graphemeBreakClass = brk;
|
2011-04-27 10:05:43 +00:00
|
|
|
}
|
util/unicode: Extract Method readUnicodeFile()
There's about a dozen files this program reads, and in each of these
cases, the code to read the file line-by-line, remove comments (or
just LF) and trim the line before further handling is duplicated. It's
also very inefficient, we have better APIs these days (readLineInto(),
rvalue *this overloads, truncate() instead of = left(), ...). Besides,
as Mårten pointed out in review, trimmed() already removes the LF, so
we don't need to do it manually.
So Extract Method readUnicodeFile() that does that, coroutine-style
(but with function object for now), from all the readX() functions
(except readUnicodeData() itself, which is using nested readLine()s.
Also maintain a line number for later improving the error messages.
Remove some isEmpty() checks in the lambdas that, after the
refactoring, can never be true (because removing whitespace from a
trimmed() string cannot make the string empty, ditto with
simplified()).
The extracted function could even pre-split the line along `;`, but
for that, I would port each lambda to use QByteArrayView / qTokenizer
first.
Picking to all active branches, because a) this is a tool and b) we
continue to update the Unicode tables in all active branches, so the
tool to do so should not differ, unless the target branch requires it
(changed data structures, e.g.). Note that readLineInto() is not in
6.5, but the tool is not required to be built against the Qt version
it is building tables for, so we can use the latest Qt features here.
Pick-to: 6.10 6.9 6.8 6.5
Change-Id: I3b699f213c98baa45bc8bbdb7ae2ac985d893798
Reviewed-by: Mårten Nordheim <marten.nordheim@qt.io>
Reviewed-by: Ahmad Samir <a.samirh78@gmail.com>
2025-08-26 17:10:45 +00:00
|
|
|
});
|
2011-04-27 10:05:43 +00:00
|
|
|
}
|
|
|
|
|
2021-04-15 12:39:51 +00:00
|
|
|
static void readEmojiData()
|
|
|
|
{
|
util/unicode: Extract Method readUnicodeFile()
There's about a dozen files this program reads, and in each of these
cases, the code to read the file line-by-line, remove comments (or
just LF) and trim the line before further handling is duplicated. It's
also very inefficient, we have better APIs these days (readLineInto(),
rvalue *this overloads, truncate() instead of = left(), ...). Besides,
as Mårten pointed out in review, trimmed() already removes the LF, so
we don't need to do it manually.
So Extract Method readUnicodeFile() that does that, coroutine-style
(but with function object for now), from all the readX() functions
(except readUnicodeData() itself, which is using nested readLine()s.
Also maintain a line number for later improving the error messages.
Remove some isEmpty() checks in the lambdas that, after the
refactoring, can never be true (because removing whitespace from a
trimmed() string cannot make the string empty, ditto with
simplified()).
The extracted function could even pre-split the line along `;`, but
for that, I would port each lambda to use QByteArrayView / qTokenizer
first.
Picking to all active branches, because a) this is a tool and b) we
continue to update the Unicode tables in all active branches, so the
tool to do so should not differ, unless the target branch requires it
(changed data structures, e.g.). Note that readLineInto() is not in
6.5, but the tool is not required to be built against the Qt version
it is building tables for, so we can use the latest Qt features here.
Pick-to: 6.10 6.9 6.8 6.5
Change-Id: I3b699f213c98baa45bc8bbdb7ae2ac985d893798
Reviewed-by: Mårten Nordheim <marten.nordheim@qt.io>
Reviewed-by: Ahmad Samir <a.samirh78@gmail.com>
2025-08-26 17:10:45 +00:00
|
|
|
readUnicodeFile("emoji-data.txt",
|
|
|
|
[] (QByteArray &line, int lineNo) {
|
2021-04-15 12:39:51 +00:00
|
|
|
line.replace(" ", "");
|
|
|
|
|
|
|
|
QList<QByteArray> l = line.split(';');
|
|
|
|
Q_ASSERT(l.size() == 2);
|
|
|
|
|
2024-03-19 07:59:18 +00:00
|
|
|
EmojiFlags emojiFlags = emojiFlagsMap.value(l[1], EmojiFlags::NoEmoji);
|
|
|
|
if (emojiFlags == EmojiFlags::NoEmoji)
|
util/unicode: Extract Method readUnicodeFile()
There's about a dozen files this program reads, and in each of these
cases, the code to read the file line-by-line, remove comments (or
just LF) and trim the line before further handling is duplicated. It's
also very inefficient, we have better APIs these days (readLineInto(),
rvalue *this overloads, truncate() instead of = left(), ...). Besides,
as Mårten pointed out in review, trimmed() already removes the LF, so
we don't need to do it manually.
So Extract Method readUnicodeFile() that does that, coroutine-style
(but with function object for now), from all the readX() functions
(except readUnicodeData() itself, which is using nested readLine()s.
Also maintain a line number for later improving the error messages.
Remove some isEmpty() checks in the lambdas that, after the
refactoring, can never be true (because removing whitespace from a
trimmed() string cannot make the string empty, ditto with
simplified()).
The extracted function could even pre-split the line along `;`, but
for that, I would port each lambda to use QByteArrayView / qTokenizer
first.
Picking to all active branches, because a) this is a tool and b) we
continue to update the Unicode tables in all active branches, so the
tool to do so should not differ, unless the target branch requires it
(changed data structures, e.g.). Note that readLineInto() is not in
6.5, but the tool is not required to be built against the Qt version
it is building tables for, so we can use the latest Qt features here.
Pick-to: 6.10 6.9 6.8 6.5
Change-Id: I3b699f213c98baa45bc8bbdb7ae2ac985d893798
Reviewed-by: Mårten Nordheim <marten.nordheim@qt.io>
Reviewed-by: Ahmad Samir <a.samirh78@gmail.com>
2025-08-26 17:10:45 +00:00
|
|
|
return;
|
2021-04-15 12:39:51 +00:00
|
|
|
|
2025-08-27 10:09:12 +00:00
|
|
|
const auto [from, to] = parseHexRange(l[0], lineNo);
|
2021-04-15 12:39:51 +00:00
|
|
|
|
|
|
|
for (int codepoint = from; codepoint <= to; ++codepoint) {
|
|
|
|
UnicodeData &ud = UnicodeData::valueRef(codepoint);
|
|
|
|
// Check we're not overwriting the data from GraphemeBreakProperty.txt...
|
2024-03-19 07:59:18 +00:00
|
|
|
Q_ASSERT(emojiFlags != EmojiFlags::Extended_Pictographic
|
|
|
|
// Extended_Pictographic should only replace GB_Any
|
|
|
|
|| ud.p.graphemeBreakClass == GraphemeBreak_Any);
|
|
|
|
if (emojiFlags == EmojiFlags::Extended_Pictographic)
|
|
|
|
ud.p.graphemeBreakClass = GraphemeBreak_Extended_Pictographic;
|
|
|
|
else
|
|
|
|
ud.p.emojiFlags |= int(emojiFlags);
|
2021-04-15 12:39:51 +00:00
|
|
|
}
|
util/unicode: Extract Method readUnicodeFile()
There's about a dozen files this program reads, and in each of these
cases, the code to read the file line-by-line, remove comments (or
just LF) and trim the line before further handling is duplicated. It's
also very inefficient, we have better APIs these days (readLineInto(),
rvalue *this overloads, truncate() instead of = left(), ...). Besides,
as Mårten pointed out in review, trimmed() already removes the LF, so
we don't need to do it manually.
So Extract Method readUnicodeFile() that does that, coroutine-style
(but with function object for now), from all the readX() functions
(except readUnicodeData() itself, which is using nested readLine()s.
Also maintain a line number for later improving the error messages.
Remove some isEmpty() checks in the lambdas that, after the
refactoring, can never be true (because removing whitespace from a
trimmed() string cannot make the string empty, ditto with
simplified()).
The extracted function could even pre-split the line along `;`, but
for that, I would port each lambda to use QByteArrayView / qTokenizer
first.
Picking to all active branches, because a) this is a tool and b) we
continue to update the Unicode tables in all active branches, so the
tool to do so should not differ, unless the target branch requires it
(changed data structures, e.g.). Note that readLineInto() is not in
6.5, but the tool is not required to be built against the Qt version
it is building tables for, so we can use the latest Qt features here.
Pick-to: 6.10 6.9 6.8 6.5
Change-Id: I3b699f213c98baa45bc8bbdb7ae2ac985d893798
Reviewed-by: Mårten Nordheim <marten.nordheim@qt.io>
Reviewed-by: Ahmad Samir <a.samirh78@gmail.com>
2025-08-26 17:10:45 +00:00
|
|
|
});
|
2021-04-15 12:39:51 +00:00
|
|
|
}
|
|
|
|
|
2011-04-27 10:05:43 +00:00
|
|
|
static void readWordBreak()
|
|
|
|
{
|
util/unicode: Extract Method readUnicodeFile()
There's about a dozen files this program reads, and in each of these
cases, the code to read the file line-by-line, remove comments (or
just LF) and trim the line before further handling is duplicated. It's
also very inefficient, we have better APIs these days (readLineInto(),
rvalue *this overloads, truncate() instead of = left(), ...). Besides,
as Mårten pointed out in review, trimmed() already removes the LF, so
we don't need to do it manually.
So Extract Method readUnicodeFile() that does that, coroutine-style
(but with function object for now), from all the readX() functions
(except readUnicodeData() itself, which is using nested readLine()s.
Also maintain a line number for later improving the error messages.
Remove some isEmpty() checks in the lambdas that, after the
refactoring, can never be true (because removing whitespace from a
trimmed() string cannot make the string empty, ditto with
simplified()).
The extracted function could even pre-split the line along `;`, but
for that, I would port each lambda to use QByteArrayView / qTokenizer
first.
Picking to all active branches, because a) this is a tool and b) we
continue to update the Unicode tables in all active branches, so the
tool to do so should not differ, unless the target branch requires it
(changed data structures, e.g.). Note that readLineInto() is not in
6.5, but the tool is not required to be built against the Qt version
it is building tables for, so we can use the latest Qt features here.
Pick-to: 6.10 6.9 6.8 6.5
Change-Id: I3b699f213c98baa45bc8bbdb7ae2ac985d893798
Reviewed-by: Mårten Nordheim <marten.nordheim@qt.io>
Reviewed-by: Ahmad Samir <a.samirh78@gmail.com>
2025-08-26 17:10:45 +00:00
|
|
|
readUnicodeFile("WordBreakProperty.txt",
|
|
|
|
[] (QByteArray &line, int lineNo) {
|
2011-04-27 10:05:43 +00:00
|
|
|
line.replace(" ", "");
|
|
|
|
|
|
|
|
QList<QByteArray> l = line.split(';');
|
|
|
|
Q_ASSERT(l.size() == 2);
|
|
|
|
|
2025-08-27 10:09:12 +00:00
|
|
|
const auto [from, to] = parseHexRange(l[0], lineNo);
|
2011-04-27 10:05:43 +00:00
|
|
|
|
2012-06-17 01:55:07 +00:00
|
|
|
WordBreakClass brk = word_break_map.value(l[1], WordBreak_Unassigned);
|
2011-04-27 10:05:43 +00:00
|
|
|
if (brk == WordBreak_Unassigned)
|
|
|
|
qFatal("unassigned word break class: %s", l[1].constData());
|
|
|
|
|
|
|
|
for (int codepoint = from; codepoint <= to; ++codepoint) {
|
2012-11-22 01:25:05 +00:00
|
|
|
// ### [
|
|
|
|
// as of Unicode 5.1, some punctuation marks were mapped to MidLetter and MidNumLet
|
|
|
|
// which caused "hi.there" to be treated like if it were just a single word;
|
|
|
|
// until we have a tailoring mechanism, retain the old behavior by remapping those characters here.
|
|
|
|
if (codepoint == 0x002E) // FULL STOP
|
|
|
|
brk = WordBreak_MidNum;
|
|
|
|
else if (codepoint == 0x003A) // COLON
|
2017-12-12 09:14:28 +00:00
|
|
|
brk = WordBreak_Any;
|
2012-11-22 01:25:05 +00:00
|
|
|
// ] ###
|
2012-05-08 01:43:16 +00:00
|
|
|
UnicodeData &ud = UnicodeData::valueRef(codepoint);
|
2012-06-17 01:55:07 +00:00
|
|
|
ud.p.wordBreakClass = brk;
|
2011-04-27 10:05:43 +00:00
|
|
|
}
|
util/unicode: Extract Method readUnicodeFile()
There's about a dozen files this program reads, and in each of these
cases, the code to read the file line-by-line, remove comments (or
just LF) and trim the line before further handling is duplicated. It's
also very inefficient, we have better APIs these days (readLineInto(),
rvalue *this overloads, truncate() instead of = left(), ...). Besides,
as Mårten pointed out in review, trimmed() already removes the LF, so
we don't need to do it manually.
So Extract Method readUnicodeFile() that does that, coroutine-style
(but with function object for now), from all the readX() functions
(except readUnicodeData() itself, which is using nested readLine()s.
Also maintain a line number for later improving the error messages.
Remove some isEmpty() checks in the lambdas that, after the
refactoring, can never be true (because removing whitespace from a
trimmed() string cannot make the string empty, ditto with
simplified()).
The extracted function could even pre-split the line along `;`, but
for that, I would port each lambda to use QByteArrayView / qTokenizer
first.
Picking to all active branches, because a) this is a tool and b) we
continue to update the Unicode tables in all active branches, so the
tool to do so should not differ, unless the target branch requires it
(changed data structures, e.g.). Note that readLineInto() is not in
6.5, but the tool is not required to be built against the Qt version
it is building tables for, so we can use the latest Qt features here.
Pick-to: 6.10 6.9 6.8 6.5
Change-Id: I3b699f213c98baa45bc8bbdb7ae2ac985d893798
Reviewed-by: Mårten Nordheim <marten.nordheim@qt.io>
Reviewed-by: Ahmad Samir <a.samirh78@gmail.com>
2025-08-26 17:10:45 +00:00
|
|
|
});
|
2011-04-27 10:05:43 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static void readSentenceBreak()
|
|
|
|
{
|
util/unicode: Extract Method readUnicodeFile()
There's about a dozen files this program reads, and in each of these
cases, the code to read the file line-by-line, remove comments (or
just LF) and trim the line before further handling is duplicated. It's
also very inefficient, we have better APIs these days (readLineInto(),
rvalue *this overloads, truncate() instead of = left(), ...). Besides,
as Mårten pointed out in review, trimmed() already removes the LF, so
we don't need to do it manually.
So Extract Method readUnicodeFile() that does that, coroutine-style
(but with function object for now), from all the readX() functions
(except readUnicodeData() itself, which is using nested readLine()s.
Also maintain a line number for later improving the error messages.
Remove some isEmpty() checks in the lambdas that, after the
refactoring, can never be true (because removing whitespace from a
trimmed() string cannot make the string empty, ditto with
simplified()).
The extracted function could even pre-split the line along `;`, but
for that, I would port each lambda to use QByteArrayView / qTokenizer
first.
Picking to all active branches, because a) this is a tool and b) we
continue to update the Unicode tables in all active branches, so the
tool to do so should not differ, unless the target branch requires it
(changed data structures, e.g.). Note that readLineInto() is not in
6.5, but the tool is not required to be built against the Qt version
it is building tables for, so we can use the latest Qt features here.
Pick-to: 6.10 6.9 6.8 6.5
Change-Id: I3b699f213c98baa45bc8bbdb7ae2ac985d893798
Reviewed-by: Mårten Nordheim <marten.nordheim@qt.io>
Reviewed-by: Ahmad Samir <a.samirh78@gmail.com>
2025-08-26 17:10:45 +00:00
|
|
|
readUnicodeFile("SentenceBreakProperty.txt",
|
|
|
|
[] (QByteArray &line, int lineNo) {
|
2011-04-27 10:05:43 +00:00
|
|
|
line.replace(" ", "");
|
|
|
|
|
|
|
|
QList<QByteArray> l = line.split(';');
|
|
|
|
Q_ASSERT(l.size() == 2);
|
|
|
|
|
2025-08-27 10:09:12 +00:00
|
|
|
const auto [from, to] = parseHexRange(l[0], lineNo);
|
2011-04-27 10:05:43 +00:00
|
|
|
|
2012-06-17 01:55:07 +00:00
|
|
|
SentenceBreakClass brk = sentence_break_map.value(l[1], SentenceBreak_Unassigned);
|
2011-04-27 10:05:43 +00:00
|
|
|
if (brk == SentenceBreak_Unassigned)
|
|
|
|
qFatal("unassigned sentence break class: %s", l[1].constData());
|
|
|
|
|
|
|
|
for (int codepoint = from; codepoint <= to; ++codepoint) {
|
2012-05-08 01:43:16 +00:00
|
|
|
UnicodeData &ud = UnicodeData::valueRef(codepoint);
|
2012-06-17 01:55:07 +00:00
|
|
|
ud.p.sentenceBreakClass = brk;
|
2011-04-27 10:05:43 +00:00
|
|
|
}
|
util/unicode: Extract Method readUnicodeFile()
There's about a dozen files this program reads, and in each of these
cases, the code to read the file line-by-line, remove comments (or
just LF) and trim the line before further handling is duplicated. It's
also very inefficient, we have better APIs these days (readLineInto(),
rvalue *this overloads, truncate() instead of = left(), ...). Besides,
as Mårten pointed out in review, trimmed() already removes the LF, so
we don't need to do it manually.
So Extract Method readUnicodeFile() that does that, coroutine-style
(but with function object for now), from all the readX() functions
(except readUnicodeData() itself, which is using nested readLine()s.
Also maintain a line number for later improving the error messages.
Remove some isEmpty() checks in the lambdas that, after the
refactoring, can never be true (because removing whitespace from a
trimmed() string cannot make the string empty, ditto with
simplified()).
The extracted function could even pre-split the line along `;`, but
for that, I would port each lambda to use QByteArrayView / qTokenizer
first.
Picking to all active branches, because a) this is a tool and b) we
continue to update the Unicode tables in all active branches, so the
tool to do so should not differ, unless the target branch requires it
(changed data structures, e.g.). Note that readLineInto() is not in
6.5, but the tool is not required to be built against the Qt version
it is building tables for, so we can use the latest Qt features here.
Pick-to: 6.10 6.9 6.8 6.5
Change-Id: I3b699f213c98baa45bc8bbdb7ae2ac985d893798
Reviewed-by: Mårten Nordheim <marten.nordheim@qt.io>
Reviewed-by: Ahmad Samir <a.samirh78@gmail.com>
2025-08-26 17:10:45 +00:00
|
|
|
});
|
2011-04-27 10:05:43 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
#if 0
|
|
|
|
// this piece of code does full case folding and comparison. We currently
|
|
|
|
// don't use it, since this gives lots of issues with things as case insensitive
|
|
|
|
// search and replace.
|
|
|
|
static inline void foldCase(uint ch, ushort *out)
|
|
|
|
{
|
|
|
|
const QUnicodeTables::Properties *p = qGetProp(ch);
|
|
|
|
if (!p->caseFoldSpecial) {
|
|
|
|
*(out++) = ch + p->caseFoldDiff;
|
|
|
|
} else {
|
|
|
|
const ushort *folded = specialCaseMap + p->caseFoldDiff;
|
2011-10-18 17:12:20 +00:00
|
|
|
ushort length = *folded++;
|
|
|
|
while (length--)
|
2011-04-27 10:05:43 +00:00
|
|
|
*out++ = *folded++;
|
|
|
|
}
|
|
|
|
*out = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int ucstricmp(const ushort *a, const ushort *ae, const ushort *b, const ushort *be)
|
|
|
|
{
|
|
|
|
if (a == b)
|
|
|
|
return 0;
|
|
|
|
if (a == 0)
|
|
|
|
return 1;
|
|
|
|
if (b == 0)
|
|
|
|
return -1;
|
|
|
|
|
|
|
|
while (a != ae && b != be) {
|
|
|
|
const QUnicodeTables::Properties *pa = qGetProp(*a);
|
|
|
|
const QUnicodeTables::Properties *pb = qGetProp(*b);
|
|
|
|
if (pa->caseFoldSpecial | pb->caseFoldSpecial)
|
|
|
|
goto special;
|
|
|
|
int diff = (int)(*a + pa->caseFoldDiff) - (int)(*b + pb->caseFoldDiff);
|
|
|
|
if ((diff))
|
|
|
|
return diff;
|
|
|
|
++a;
|
|
|
|
++b;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (a == ae) {
|
|
|
|
if (b == be)
|
|
|
|
return 0;
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
return 1;
|
|
|
|
special:
|
|
|
|
ushort abuf[SPECIAL_CASE_MAX_LEN + 1];
|
|
|
|
ushort bbuf[SPECIAL_CASE_MAX_LEN + 1];
|
|
|
|
abuf[0] = bbuf[0] = 0;
|
|
|
|
ushort *ap = abuf;
|
|
|
|
ushort *bp = bbuf;
|
|
|
|
while (1) {
|
|
|
|
if (!*ap) {
|
|
|
|
if (a == ae) {
|
|
|
|
if (!*bp && b == be)
|
|
|
|
return 0;
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
foldCase(*(a++), abuf);
|
|
|
|
ap = abuf;
|
|
|
|
}
|
|
|
|
if (!*bp) {
|
|
|
|
if (b == be)
|
|
|
|
return 1;
|
|
|
|
foldCase(*(b++), bbuf);
|
|
|
|
bp = bbuf;
|
|
|
|
}
|
|
|
|
if (*ap != *bp)
|
|
|
|
return (int)*ap - (int)*bp;
|
|
|
|
++ap;
|
|
|
|
++bp;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
static int ucstricmp(const ushort *a, const ushort *ae, const uchar *b)
|
|
|
|
{
|
|
|
|
if (a == 0)
|
|
|
|
return 1;
|
|
|
|
if (b == 0)
|
|
|
|
return -1;
|
|
|
|
|
|
|
|
while (a != ae && *b) {
|
|
|
|
const QUnicodeTables::Properties *pa = qGetProp(*a);
|
|
|
|
const QUnicodeTables::Properties *pb = qGetProp((ushort)*b);
|
|
|
|
if (pa->caseFoldSpecial | pb->caseFoldSpecial)
|
|
|
|
goto special;
|
|
|
|
int diff = (int)(*a + pa->caseFoldDiff) - (int)(*b + pb->caseFoldDiff);
|
|
|
|
if ((diff))
|
|
|
|
return diff;
|
|
|
|
++a;
|
|
|
|
++b;
|
|
|
|
}
|
|
|
|
if (a == ae) {
|
|
|
|
if (!*b)
|
|
|
|
return 0;
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
special:
|
|
|
|
ushort abuf[SPECIAL_CASE_MAX_LEN + 1];
|
|
|
|
ushort bbuf[SPECIAL_CASE_MAX_LEN + 1];
|
|
|
|
abuf[0] = bbuf[0] = 0;
|
|
|
|
ushort *ap = abuf;
|
|
|
|
ushort *bp = bbuf;
|
|
|
|
while (1) {
|
|
|
|
if (!*ap) {
|
|
|
|
if (a == ae) {
|
|
|
|
if (!*bp && !*b)
|
|
|
|
return 0;
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
foldCase(*(a++), abuf);
|
|
|
|
ap = abuf;
|
|
|
|
}
|
|
|
|
if (!*bp) {
|
|
|
|
if (!*b)
|
|
|
|
return 1;
|
|
|
|
foldCase(*(b++), bbuf);
|
|
|
|
bp = bbuf;
|
|
|
|
}
|
|
|
|
if (*ap != *bp)
|
|
|
|
return (int)*ap - (int)*bp;
|
|
|
|
++ap;
|
|
|
|
++bp;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#if 0
|
|
|
|
// Distinct block names in order of first appearance; readBlocks() uses the
// position in this list as the block index.
static QList<QByteArray> blockNames;

// One code point range read from Blocks.txt.
struct BlockInfo
{
    int blockIndex;     // position of the block's name in blockNames
    int firstCodePoint; // start of the range, as parsed from the file
    int lastCodePoint;  // end of the range; equals firstCodePoint for single values
};

// Every range appended by readBlocks(), in file order.
static QList<BlockInfo> blockInfoList;
|
|
|
|
|
|
|
|
static void readBlocks()
|
|
|
|
{
|
2012-06-17 01:20:59 +00:00
|
|
|
qDebug("Reading Blocks.txt");
|
|
|
|
|
2011-04-27 10:05:43 +00:00
|
|
|
QFile f("data/Blocks.txt");
|
2025-01-17 12:13:12 +00:00
|
|
|
if (!f.open(QFile::ReadOnly))
|
|
|
|
qFatal() << "Couldn't open Blocks.txt:" << f.errorString();
|
2011-04-27 10:05:43 +00:00
|
|
|
|
|
|
|
while (!f.atEnd()) {
|
|
|
|
QByteArray line = f.readLine();
|
|
|
|
line.resize(line.size() - 1);
|
|
|
|
|
|
|
|
int comment = line.indexOf("#");
|
|
|
|
if (comment >= 0)
|
|
|
|
line = line.left(comment);
|
|
|
|
|
|
|
|
line.replace(" ", "");
|
|
|
|
|
|
|
|
if (line.isEmpty())
|
|
|
|
continue;
|
|
|
|
|
|
|
|
int semicolon = line.indexOf(';');
|
|
|
|
Q_ASSERT(semicolon >= 0);
|
|
|
|
QByteArray codePoints = line.left(semicolon);
|
|
|
|
QByteArray blockName = line.mid(semicolon + 1);
|
|
|
|
|
|
|
|
int blockIndex = blockNames.indexOf(blockName);
|
|
|
|
if (blockIndex == -1) {
|
|
|
|
blockIndex = blockNames.size();
|
|
|
|
blockNames.append(blockName);
|
|
|
|
}
|
|
|
|
|
|
|
|
codePoints.replace("..", ".");
|
|
|
|
QList<QByteArray> cl = codePoints.split('.');
|
|
|
|
|
|
|
|
bool ok;
|
|
|
|
int first = cl[0].toInt(&ok, 16);
|
|
|
|
Q_ASSERT(ok);
|
|
|
|
int last = first;
|
|
|
|
if (cl.size() == 2) {
|
|
|
|
last = cl[1].toInt(&ok, 16);
|
|
|
|
Q_ASSERT(ok);
|
|
|
|
}
|
|
|
|
|
|
|
|
BlockInfo blockInfo = { blockIndex, first, last };
|
|
|
|
blockInfoList.append(blockInfo);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
static void readScripts()
|
|
|
|
{
|
util/unicode: Extract Method readUnicodeFile()
There's about a dozen files this program reads, and in each of these
cases, the code to read the file line-by-line, remove comments (or
just LF) and trim the line before further handling is duplicated. It's
also very inefficient, we have better APIs these days (readLineInto(),
rvalue *this overloads, truncate() instead of = left(), ...). Besides,
as Mårten pointed out in review, trimmed() already removes the LF, so
we don't need to do it manually.
So Extract Method readUnicodeFile() that does that, coroutine-style
(but with function object for now), from all the readX() functions
(except readUnicodeData() itself, which is using nested readLine()s.
Also maintain a line number for later improving the error messages.
Remove some isEmpty() checks in the lambdas that, after the
refactoring, can never be true (because removing whitespace from a
trimmed() string cannot make the string empty, ditto with
simplified()).
The extracted function could even pre-split the line along `;`, but
for that, I would port each lambda to use QByteArrayView / qTokenizer
first.
Picking to all active branches, because a) this is a tool and b) we
continue to update the Unicode tables in all active branches, so the
tool to do so should not differ, unless the target branch requires it
(changed data structures, e.g.). Note that readLineInto() is not in
6.5, but the tool is not required to be built against the Qt version
it is building tables for, so we can use the latest Qt features here.
Pick-to: 6.10 6.9 6.8 6.5
Change-Id: I3b699f213c98baa45bc8bbdb7ae2ac985d893798
Reviewed-by: Mårten Nordheim <marten.nordheim@qt.io>
Reviewed-by: Ahmad Samir <a.samirh78@gmail.com>
2025-08-26 17:10:45 +00:00
|
|
|
readUnicodeFile("Scripts.txt",
|
|
|
|
[] (QByteArray &line, int lineNo) {
|
2012-06-11 12:19:24 +00:00
|
|
|
line.replace(" ", "");
|
2011-04-27 10:05:43 +00:00
|
|
|
|
2012-06-11 12:19:24 +00:00
|
|
|
if (line.isEmpty())
|
util/unicode: Extract Method readUnicodeFile()
There's about a dozen files this program reads, and in each of these
cases, the code to read the file line-by-line, remove comments (or
just LF) and trim the line before further handling is duplicated. It's
also very inefficient, we have better APIs these days (readLineInto(),
rvalue *this overloads, truncate() instead of = left(), ...). Besides,
as Mårten pointed out in review, trimmed() already removes the LF, so
we don't need to do it manually.
So Extract Method readUnicodeFile() that does that, coroutine-style
(but with function object for now), from all the readX() functions
(except readUnicodeData() itself, which is using nested readLine()s.
Also maintain a line number for later improving the error messages.
Remove some isEmpty() checks in the lambdas that, after the
refactoring, can never be true (because removing whitespace from a
trimmed() string cannot make the string empty, ditto with
simplified()).
The extracted function could even pre-split the line along `;`, but
for that, I would port each lambda to use QByteArrayView / qTokenizer
first.
Picking to all active branches, because a) this is a tool and b) we
continue to update the Unicode tables in all active branches, so the
tool to do so should not differ, unless the target branch requires it
(changed data structures, e.g.). Note that readLineInto() is not in
6.5, but the tool is not required to be built against the Qt version
it is building tables for, so we can use the latest Qt features here.
Pick-to: 6.10 6.9 6.8 6.5
Change-Id: I3b699f213c98baa45bc8bbdb7ae2ac985d893798
Reviewed-by: Mårten Nordheim <marten.nordheim@qt.io>
Reviewed-by: Ahmad Samir <a.samirh78@gmail.com>
2025-08-26 17:10:45 +00:00
|
|
|
return;
|
2011-04-27 10:05:43 +00:00
|
|
|
|
2012-06-11 12:19:24 +00:00
|
|
|
int semicolon = line.indexOf(';');
|
|
|
|
Q_ASSERT(semicolon >= 0);
|
|
|
|
QByteArray codePoints = line.left(semicolon);
|
|
|
|
QByteArray scriptName = line.mid(semicolon + 1);
|
2011-04-27 10:05:43 +00:00
|
|
|
|
2025-08-27 10:09:12 +00:00
|
|
|
const auto [first, last] = parseHexRange(codePoints, lineNo);
|
2011-04-27 10:05:43 +00:00
|
|
|
|
2012-12-08 03:36:49 +00:00
|
|
|
if (!scriptMap.contains(scriptName))
|
|
|
|
qFatal("Unhandled script property value: %s", scriptName.constData());
|
|
|
|
QChar::Script script = scriptMap.value(scriptName, QChar::Script_Unknown);
|
|
|
|
|
2012-06-11 12:19:24 +00:00
|
|
|
for (int codepoint = first; codepoint <= last; ++codepoint) {
|
|
|
|
UnicodeData &ud = UnicodeData::valueRef(codepoint);
|
2012-12-08 03:36:49 +00:00
|
|
|
ud.p.script = script;
|
2011-04-27 10:05:43 +00:00
|
|
|
}
|
util/unicode: Extract Method readUnicodeFile()
There's about a dozen files this program reads, and in each of these
cases, the code to read the file line-by-line, remove comments (or
just LF) and trim the line before further handling is duplicated. It's
also very inefficient, we have better APIs these days (readLineInto(),
rvalue *this overloads, truncate() instead of = left(), ...). Besides,
as Mårten pointed out in review, trimmed() already removes the LF, so
we don't need to do it manually.
So Extract Method readUnicodeFile() that does that, coroutine-style
(but with function object for now), from all the readX() functions
(except readUnicodeData() itself, which is using nested readLine()s.
Also maintain a line number for later improving the error messages.
Remove some isEmpty() checks in the lambdas that, after the
refactoring, can never be true (because removing whitespace from a
trimmed() string cannot make the string empty, ditto with
simplified()).
The extracted function could even pre-split the line along `;`, but
for that, I would port each lambda to use QByteArrayView / qTokenizer
first.
Picking to all active branches, because a) this is a tool and b) we
continue to update the Unicode tables in all active branches, so the
tool to do so should not differ, unless the target branch requires it
(changed data structures, e.g.). Note that readLineInto() is not in
6.5, but the tool is not required to be built against the Qt version
it is building tables for, so we can use the latest Qt features here.
Pick-to: 6.10 6.9 6.8 6.5
Change-Id: I3b699f213c98baa45bc8bbdb7ae2ac985d893798
Reviewed-by: Mårten Nordheim <marten.nordheim@qt.io>
Reviewed-by: Ahmad Samir <a.samirh78@gmail.com>
2025-08-26 17:10:45 +00:00
|
|
|
});
|
2011-04-27 10:05:43 +00:00
|
|
|
}
|
|
|
|
|
2021-08-19 14:29:43 +00:00
|
|
|
static QMap<char32_t, QString> idnaMappingTable;
|
2021-07-30 10:09:46 +00:00
|
|
|
|
|
|
|
static void readIdnaMappingTable()
|
|
|
|
{
|
util/unicode: Extract Method readUnicodeFile()
There's about a dozen files this program reads, and in each of these
cases, the code to read the file line-by-line, remove comments (or
just LF) and trim the line before further handling is duplicated. It's
also very inefficient, we have better APIs these days (readLineInto(),
rvalue *this overloads, truncate() instead of = left(), ...). Besides,
as Mårten pointed out in review, trimmed() already removes the LF, so
we don't need to do it manually.
So Extract Method readUnicodeFile() that does that, coroutine-style
(but with function object for now), from all the readX() functions
(except readUnicodeData() itself, which is using nested readLine()s.
Also maintain a line number for later improving the error messages.
Remove some isEmpty() checks in the lambdas that, after the
refactoring, can never be true (because removing whitespace from a
trimmed() string cannot make the string empty, ditto with
simplified()).
The extracted function could even pre-split the line along `;`, but
for that, I would port each lambda to use QByteArrayView / qTokenizer
first.
Picking to all active branches, because a) this is a tool and b) we
continue to update the Unicode tables in all active branches, so the
tool to do so should not differ, unless the target branch requires it
(changed data structures, e.g.). Note that readLineInto() is not in
6.5, but the tool is not required to be built against the Qt version
it is building tables for, so we can use the latest Qt features here.
Pick-to: 6.10 6.9 6.8 6.5
Change-Id: I3b699f213c98baa45bc8bbdb7ae2ac985d893798
Reviewed-by: Mårten Nordheim <marten.nordheim@qt.io>
Reviewed-by: Ahmad Samir <a.samirh78@gmail.com>
2025-08-26 17:10:45 +00:00
|
|
|
readUnicodeFile("IdnaMappingTable.txt",
|
2025-08-27 14:12:46 +00:00
|
|
|
[] (const QByteArray &line, int lineNo) {
|
2021-07-30 10:09:46 +00:00
|
|
|
QList<QByteArray> fields = line.split(';');
|
|
|
|
Q_ASSERT(fields.size() >= 2);
|
|
|
|
|
2025-08-27 10:09:12 +00:00
|
|
|
const auto [first, last] = parseHexRange(fields[0], lineNo);
|
2021-07-30 10:09:46 +00:00
|
|
|
|
|
|
|
const QByteArray statusString = fields[1].trimmed();
|
|
|
|
if (!idnaStatusMap.contains(statusString))
|
|
|
|
qFatal("Unhandled IDNA status property value for %s: %s",
|
2025-08-27 09:59:24 +00:00
|
|
|
fields[0].constData(), statusString.data());
|
2021-07-30 10:09:46 +00:00
|
|
|
IdnaRawStatus rawStatus = idnaStatusMap.value(statusString);
|
|
|
|
|
2021-08-19 14:29:43 +00:00
|
|
|
QString mapping;
|
2021-07-30 10:09:46 +00:00
|
|
|
|
|
|
|
switch (rawStatus) {
|
|
|
|
case IdnaRawStatus::Disallowed:
|
|
|
|
case IdnaRawStatus::Valid:
|
|
|
|
case IdnaRawStatus::Ignored:
|
|
|
|
case IdnaRawStatus::DisallowedStd3Valid:
|
|
|
|
break;
|
|
|
|
|
|
|
|
case IdnaRawStatus::Mapped:
|
|
|
|
case IdnaRawStatus::Deviation:
|
|
|
|
case IdnaRawStatus::DisallowedStd3Mapped:
|
|
|
|
Q_ASSERT(fields.size() >= 3);
|
|
|
|
|
2025-08-27 14:12:46 +00:00
|
|
|
for (char32_t val : parseHexList(fields[2], lineNo))
|
|
|
|
mapping.append(QChar::fromUcs4(val));
|
2021-07-30 10:09:46 +00:00
|
|
|
|
|
|
|
// Some deviations have empty mappings, others should not...
|
|
|
|
if (mapping.isEmpty()) {
|
|
|
|
Q_ASSERT(rawStatus == IdnaRawStatus::Deviation);
|
2025-08-27 09:59:24 +00:00
|
|
|
qDebug() << " Empty IDNA mapping for" << fields[0];
|
2021-07-30 10:09:46 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
for (int codepoint = first; codepoint <= last; ++codepoint) {
|
|
|
|
UnicodeData &ud = UnicodeData::valueRef(codepoint);
|
|
|
|
// Ensure that ranges don't overlap.
|
|
|
|
Q_ASSERT(ud.idnaRawStatus == IdnaRawStatus::Disallowed);
|
|
|
|
ud.idnaRawStatus = rawStatus;
|
|
|
|
|
|
|
|
// ASCII codepoints are skipped here because they are processed in separate
|
|
|
|
// optimized code paths that do not use this mapping table.
|
|
|
|
if (codepoint >= 0x80 && !mapping.isEmpty())
|
|
|
|
idnaMappingTable[codepoint] = mapping;
|
|
|
|
}
|
util/unicode: Extract Method readUnicodeFile()
There's about a dozen files this program reads, and in each of these
cases, the code to read the file line-by-line, remove comments (or
just LF) and trim the line before further handling is duplicated. It's
also very inefficient, we have better APIs these days (readLineInto(),
rvalue *this overloads, truncate() instead of = left(), ...). Besides,
as Mårten pointed out in review, trimmed() already removes the LF, so
we don't need to do it manually.
So Extract Method readUnicodeFile() that does that, coroutine-style
(but with function object for now), from all the readX() functions
(except readUnicodeData() itself, which is using nested readLine()s.
Also maintain a line number for later improving the error messages.
Remove some isEmpty() checks in the lambdas that, after the
refactoring, can never be true (because removing whitespace from a
trimmed() string cannot make the string empty, ditto with
simplified()).
The extracted function could even pre-split the line along `;`, but
for that, I would port each lambda to use QByteArrayView / qTokenizer
first.
Picking to all active branches, because a) this is a tool and b) we
continue to update the Unicode tables in all active branches, so the
tool to do so should not differ, unless the target branch requires it
(changed data structures, e.g.). Note that readLineInto() is not in
6.5, but the tool is not required to be built against the Qt version
it is building tables for, so we can use the latest Qt features here.
Pick-to: 6.10 6.9 6.8 6.5
Change-Id: I3b699f213c98baa45bc8bbdb7ae2ac985d893798
Reviewed-by: Mårten Nordheim <marten.nordheim@qt.io>
Reviewed-by: Ahmad Samir <a.samirh78@gmail.com>
2025-08-26 17:10:45 +00:00
|
|
|
});
|
2021-07-30 10:09:46 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
Resolve IDNA status by deciding whether to allow STD3 violations
|
|
|
|
|
|
|
|
Underscores are normally prohibited by STD3 rules but Qt allows underscores
|
|
|
|
to be used inside URLs (see QTBUG-7434 for example). This code changes the
|
|
|
|
underscore status to Valid. The same is done to mapped codepoints that
|
|
|
|
map to underscores combined with other Valid codepoints.
|
|
|
|
|
|
|
|
Underscores in domain names are required when using DNS-SD protocol and they
|
|
|
|
are also allowed by the SMB protocol.
|
|
|
|
*/
|
|
|
|
static void resolveIdnaStatus()
|
|
|
|
{
|
|
|
|
qDebug("resolveIdnaStatus:");
|
|
|
|
|
|
|
|
UnicodeData::valueRef(u'_').idnaRawStatus = IdnaRawStatus::Valid;
|
|
|
|
|
|
|
|
for (int codepoint = 0; codepoint <= QChar::LastValidCodePoint; ++codepoint) {
|
|
|
|
UnicodeData &ud = UnicodeData::valueRef(codepoint);
|
|
|
|
switch (ud.idnaRawStatus) {
|
|
|
|
case IdnaRawStatus::Disallowed:
|
|
|
|
case IdnaRawStatus::Valid:
|
|
|
|
case IdnaRawStatus::Ignored:
|
|
|
|
case IdnaRawStatus::Deviation:
|
|
|
|
case IdnaRawStatus::Mapped:
|
|
|
|
ud.p.idnaStatus = static_cast<IdnaStatus>(ud.idnaRawStatus);
|
|
|
|
break;
|
|
|
|
case IdnaRawStatus::DisallowedStd3Valid:
|
|
|
|
ud.p.idnaStatus = IdnaStatus::Disallowed;
|
|
|
|
break;
|
|
|
|
case IdnaRawStatus::DisallowedStd3Mapped: {
|
|
|
|
Q_ASSERT(idnaMappingTable.contains(codepoint));
|
|
|
|
const auto &mapping = idnaMappingTable[codepoint];
|
|
|
|
|
2021-08-19 14:29:43 +00:00
|
|
|
bool allow = true;
|
|
|
|
for (QStringIterator iter(mapping); iter.hasNext();) {
|
|
|
|
if (UnicodeData::valueRef(iter.next()).idnaRawStatus != IdnaRawStatus::Valid) {
|
|
|
|
allow = false;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
2021-07-30 10:09:46 +00:00
|
|
|
|
|
|
|
if (allow) {
|
|
|
|
qDebug() << " Allowing" << Qt::hex << codepoint;
|
|
|
|
ud.p.idnaStatus = IdnaStatus::Mapped;
|
|
|
|
} else {
|
|
|
|
ud.p.idnaStatus = IdnaStatus::Disallowed;
|
|
|
|
idnaMappingTable.remove(codepoint);
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-08-30 09:35:02 +00:00
|
|
|
/*
|
|
|
|
Return maximum overlap for strings left and right in this order.
|
|
|
|
|
|
|
|
The input strings should not be substrings of each other.
|
|
|
|
*/
|
|
|
|
static qsizetype overlap(const QString &left, const QString &right)
|
|
|
|
{
|
|
|
|
for (qsizetype n = std::min(left.size(), right.size()) - 1; n > 0; n--) {
|
|
|
|
if (left.last(n) == right.first(n))
|
|
|
|
return n;
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
using GraphNode = unsigned int;
|
|
|
|
|
|
|
|
struct OverlapGraphEdge
|
|
|
|
{
|
|
|
|
GraphNode start;
|
|
|
|
GraphNode end;
|
|
|
|
qsizetype overlap;
|
|
|
|
};
|
|
|
|
|
2021-08-19 14:29:43 +00:00
|
|
|
/*
|
2021-09-15 13:31:13 +00:00
|
|
|
Returns a common superstring of all inputs.
|
2021-08-19 14:29:43 +00:00
|
|
|
|
|
|
|
Ideally this function would return the superstring of the smallest
|
|
|
|
possible size, but the shortest common superstring problem is know to be
|
|
|
|
NP-hard so an approximation must be used here.
|
|
|
|
|
2021-08-30 09:35:02 +00:00
|
|
|
This function implements the greedy algorithm for building the superstring.
|
2021-08-19 14:29:43 +00:00
|
|
|
|
|
|
|
As an optimization this function is allowed to destroy its inputs.
|
|
|
|
*/
|
2025-08-25 07:51:24 +00:00
|
|
|
static QString buildSuperstring(QList<QString> &&inputs)
|
2021-08-19 14:29:43 +00:00
|
|
|
{
|
2021-08-30 09:35:02 +00:00
|
|
|
// Ensure that the inputs don't contain substrings.
|
|
|
|
// First, sort the array by length to make substring removal easier.
|
2021-08-19 14:29:43 +00:00
|
|
|
std::sort(inputs.begin(), inputs.end(), [](const QString &a, const QString &b) {
|
|
|
|
return a.size() == b.size() ? a > b : a.size() > b.size();
|
|
|
|
});
|
|
|
|
|
2021-08-30 09:35:02 +00:00
|
|
|
// Remove duplicates and other substrings
|
|
|
|
for (auto i = inputs.begin() + 1; i != inputs.end();) {
|
|
|
|
bool isSubstring = std::any_of(inputs.begin(), i, [i](const QString &s) {
|
|
|
|
return s.contains(*i);
|
|
|
|
});
|
|
|
|
i = isSubstring ? inputs.erase(i) : i + 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Build overlap graph for the remaining inputs. It is fully-connected.
|
|
|
|
QList<OverlapGraphEdge> graphEdges;
|
|
|
|
graphEdges.reserve(inputs.size() * (inputs.size() - 1));
|
|
|
|
|
|
|
|
for (GraphNode i = 0; i < inputs.size(); i++) {
|
|
|
|
for (GraphNode j = 0; j < inputs.size(); j++) {
|
|
|
|
if (i != j)
|
|
|
|
graphEdges.append(OverlapGraphEdge {i, j, overlap(inputs[i], inputs[j])});
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Build a Hamiltonian path through the overlap graph, taking nodes with highest overlap
|
|
|
|
// first.
|
|
|
|
std::sort(graphEdges.begin(), graphEdges.end(), [](const auto &a, const auto &b) {
|
|
|
|
return a.overlap == b.overlap
|
|
|
|
? a.start == b.start ? a.end < b.end : a.start < b.start
|
|
|
|
: a.overlap > b.overlap;
|
|
|
|
});
|
|
|
|
|
|
|
|
QBitArray starts(inputs.size());
|
|
|
|
QBitArray ends(inputs.size());
|
|
|
|
QMap<GraphNode, OverlapGraphEdge> pathEdges;
|
|
|
|
|
|
|
|
auto createsCycle = [&](const OverlapGraphEdge &edge) {
|
|
|
|
if (!starts[edge.end] || !ends[edge.start])
|
|
|
|
return false;
|
|
|
|
Q_ASSERT(!pathEdges.contains(edge.start)); // Caller checks it's not yet a start.
|
|
|
|
|
|
|
|
GraphNode node = edge.end;
|
|
|
|
while (pathEdges.contains(node))
|
|
|
|
node = pathEdges[node].end;
|
|
|
|
|
|
|
|
return node == edge.start;
|
|
|
|
};
|
|
|
|
|
|
|
|
for (const auto &edge : graphEdges) {
|
|
|
|
if (!starts[edge.start] && !ends[edge.end] && !createsCycle(edge)) {
|
|
|
|
starts.setBit(edge.start);
|
|
|
|
ends.setBit(edge.end);
|
|
|
|
pathEdges[edge.start] = edge;
|
|
|
|
if (pathEdges.size() == inputs.size() - 1)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
Q_ASSERT(ends.count(false) == 1);
|
|
|
|
Q_ASSERT(starts.count(false) == 1);
|
|
|
|
|
|
|
|
// Find the start node of the path.
|
|
|
|
GraphNode node = 0;
|
|
|
|
while (node < ends.size() && ends[node])
|
|
|
|
node++;
|
|
|
|
Q_ASSERT(node < ends.size());
|
|
|
|
|
|
|
|
QString superstring = inputs[node];
|
|
|
|
qsizetype pathNodes = 1; // Count path nodes for sanity check
|
|
|
|
|
|
|
|
while (pathEdges.contains(node)) {
|
|
|
|
const auto &edge = pathEdges[node];
|
|
|
|
Q_ASSERT(edge.start == node);
|
|
|
|
|
|
|
|
superstring.append(QStringView { inputs[edge.end] }.sliced(edge.overlap));
|
2021-08-19 14:29:43 +00:00
|
|
|
|
2021-08-30 09:35:02 +00:00
|
|
|
node = edge.end;
|
|
|
|
pathNodes++;
|
2021-08-19 14:29:43 +00:00
|
|
|
}
|
2021-08-30 09:35:02 +00:00
|
|
|
Q_ASSERT(pathNodes == inputs.size());
|
2021-08-19 14:29:43 +00:00
|
|
|
|
|
|
|
return superstring;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
Stores IDNA mapping information.
|
|
|
|
|
|
|
|
The mapping table is an array of IdnaMapEntry instances sorted
|
|
|
|
by codePoint. For mapping resulting in a single QChar, that character
|
|
|
|
is stored inside the entry in charOrOffset. Otherwise the entry contains
|
|
|
|
offset inside idnaMappingData array.
|
|
|
|
|
|
|
|
It should be possible to find all mapped strings with size > 1 inside
|
|
|
|
idnaMappingData, otherwise the construction of this array should be optimized
|
|
|
|
to take advantage of common substrings and minimize the data size.
|
|
|
|
*/
|
2021-07-30 10:09:46 +00:00
|
|
|
static QByteArray createIdnaMapping()
|
|
|
|
{
|
|
|
|
qDebug("createIdnaMapping:");
|
|
|
|
|
2021-08-19 14:29:43 +00:00
|
|
|
QList<QString> values;
|
|
|
|
values.reserve(idnaMappingTable.size());
|
|
|
|
qsizetype uncompressedSize = 0;
|
2021-07-30 10:09:46 +00:00
|
|
|
|
2021-08-19 14:29:43 +00:00
|
|
|
for (const auto &v : idnaMappingTable.values()) {
|
2021-08-19 14:47:58 +00:00
|
|
|
if (v.size() > 2) {
|
2021-08-19 14:29:43 +00:00
|
|
|
values.append(v);
|
|
|
|
uncompressedSize += v.size();
|
|
|
|
}
|
2021-07-30 10:09:46 +00:00
|
|
|
}
|
|
|
|
|
2025-08-25 07:51:24 +00:00
|
|
|
QString idnaMappingData = buildSuperstring(std::move(values));
|
2021-08-19 14:29:43 +00:00
|
|
|
qDebug() << " uncompressed size:" << uncompressedSize << "characters";
|
|
|
|
qDebug() << " consolidated size:" << idnaMappingData.size() << "characters";
|
|
|
|
|
2021-07-30 10:09:46 +00:00
|
|
|
qsizetype memoryUsage = 0;
|
2021-08-19 14:29:43 +00:00
|
|
|
|
2021-07-30 10:09:46 +00:00
|
|
|
QByteArray out =
|
2022-05-24 02:16:12 +00:00
|
|
|
"static constexpr char16_t idnaMappingData[] = {";
|
2021-08-19 14:29:43 +00:00
|
|
|
|
|
|
|
int col = 0;
|
|
|
|
for (auto c : idnaMappingData) {
|
|
|
|
if (col == 0)
|
|
|
|
out += "\n ";
|
|
|
|
out += " 0x" + QByteArray::number(c.unicode(), 16) + ",";
|
|
|
|
col = (col + 1) % 12;
|
|
|
|
memoryUsage += 2;
|
|
|
|
}
|
|
|
|
out += "\n};\n\n";
|
|
|
|
|
|
|
|
// Check if the values fit into IdnaMapEntry below.
|
|
|
|
Q_ASSERT(idnaMappingData.size() < (1 << 16));
|
|
|
|
|
|
|
|
// This could be written more elegantly with a union and designated initializers,
|
|
|
|
// but designated initizers is a C++20 feature
|
|
|
|
out +=
|
2021-07-30 10:09:46 +00:00
|
|
|
"struct IdnaMapEntry {\n"
|
2021-08-19 14:47:58 +00:00
|
|
|
" // 21 bits suffice for any valid code-point (LastValidCodePoint = 0x10ffff)\n"
|
|
|
|
" unsigned codePoint : 24;\n"
|
|
|
|
" unsigned size : 8;\n"
|
|
|
|
" char16_t ucs[2]; // ucs[0] is offset if size > 2\n"
|
2021-08-19 14:29:43 +00:00
|
|
|
"};\n"
|
|
|
|
"static_assert(sizeof(IdnaMapEntry) == 8);\n\n"
|
2022-05-24 02:16:12 +00:00
|
|
|
"static constexpr IdnaMapEntry idnaMap[] = {\n";
|
2021-07-30 10:09:46 +00:00
|
|
|
|
|
|
|
for (auto i = idnaMappingTable.keyValueBegin(); i != idnaMappingTable.keyValueEnd(); i++) {
|
2021-08-19 14:29:43 +00:00
|
|
|
const QString &mapping = i->second;
|
|
|
|
Q_ASSERT(!mapping.isEmpty());
|
|
|
|
|
|
|
|
qsizetype mappingIndex = idnaMappingData.indexOf(mapping);
|
2021-08-19 14:47:58 +00:00
|
|
|
Q_ASSERT(mappingIndex >= 0 || mapping.size() <= 2);
|
2021-08-19 14:29:43 +00:00
|
|
|
|
|
|
|
out += " { 0x" + QByteArray::number(i->first, 16) +
|
|
|
|
", " + QByteArray::number(mapping.size());
|
2021-08-19 14:47:58 +00:00
|
|
|
if (mapping.size() <= 2) {
|
|
|
|
out += ", { 0x" + QByteArray::number(mapping[0].unicode(), 16);
|
|
|
|
if (mapping.size() == 2)
|
|
|
|
out += ", 0x" + QByteArray::number(mapping[1].unicode(), 16);
|
|
|
|
else
|
|
|
|
out += ", 0";
|
|
|
|
} else {
|
|
|
|
out += ", { " + QByteArray::number(mappingIndex);
|
|
|
|
out += ", 0";
|
|
|
|
}
|
|
|
|
out += " } },\n";
|
2021-08-19 14:29:43 +00:00
|
|
|
memoryUsage += 8;
|
2021-07-30 10:09:46 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
qDebug() << " memory usage:" << memoryUsage << "bytes";
|
|
|
|
|
|
|
|
out +=
|
|
|
|
"};\n\n"
|
2021-08-19 14:29:43 +00:00
|
|
|
"Q_CORE_EXPORT QStringView QT_FASTCALL idnaMapping(char32_t ucs4) noexcept\n"
|
2021-07-30 10:09:46 +00:00
|
|
|
"{\n"
|
|
|
|
" auto i = std::lower_bound(std::begin(idnaMap), std::end(idnaMap), ucs4,\n"
|
|
|
|
" [](const auto &p, char32_t c) { return p.codePoint < c; });\n"
|
2021-08-19 14:29:43 +00:00
|
|
|
" if (i == std::end(idnaMap) || i->codePoint != ucs4)\n"
|
|
|
|
" return {};\n\n"
|
2021-08-19 14:47:58 +00:00
|
|
|
" return QStringView(i->size > 2 ? idnaMappingData + i->ucs[0] : i->ucs, i->size);\n"
|
2021-07-30 10:09:46 +00:00
|
|
|
"}\n\n";
|
|
|
|
|
|
|
|
return out;
|
|
|
|
}
|
|
|
|
|
2011-04-27 10:05:43 +00:00
|
|
|
#if 0
|
|
|
|
static void dump(int from, int to)
|
|
|
|
{
|
|
|
|
for (int i = from; i <= to; ++i) {
|
2012-05-08 01:43:16 +00:00
|
|
|
UnicodeData &d = UnicodeData::valueRef(i);
|
2011-04-27 10:05:43 +00:00
|
|
|
qDebug("0x%04x: cat=%d combining=%d dir=%d case=%x mirror=%x joining=%d age=%d",
|
|
|
|
i, d.p.category, d.p.combiningClass, d.p.direction, d.otherCase, d.mirroredChar, d.p.joining, d.p.age);
|
|
|
|
if (d.decompositionType != QChar::NoDecomposition) {
|
|
|
|
qDebug(" decomposition: type=%d, length=%d, first=%x", d.decompositionType, d.decomposition.size(),
|
|
|
|
d.decomposition[0]);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
qDebug(" ");
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2012-06-17 01:20:59 +00:00
|
|
|
static QList<PropertyFlags> uniqueProperties;
|
|
|
|
|
|
|
|
static void computeUniqueProperties()
|
|
|
|
{
|
|
|
|
qDebug("computeUniqueProperties:");
|
|
|
|
for (int codepoint = 0; codepoint <= QChar::LastValidCodePoint; ++codepoint) {
|
|
|
|
UnicodeData &d = UnicodeData::valueRef(codepoint);
|
|
|
|
int index = uniqueProperties.indexOf(d.p);
|
|
|
|
if (index == -1) {
|
|
|
|
index = uniqueProperties.size();
|
|
|
|
uniqueProperties.append(d.p);
|
|
|
|
}
|
|
|
|
d.propertyIndex = index;
|
|
|
|
}
|
2025-01-17 12:13:47 +00:00
|
|
|
qDebug(" %" PRIdQSIZETYPE " unique unicode properties found", uniqueProperties.size());
|
2012-06-17 01:20:59 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
struct UniqueBlock {
|
|
|
|
inline UniqueBlock() : index(-1) {}
|
|
|
|
|
|
|
|
inline bool operator==(const UniqueBlock &other) const
|
|
|
|
{ return values == other.values; }
|
|
|
|
|
2011-04-27 10:05:43 +00:00
|
|
|
int index;
|
2020-07-07 10:04:21 +00:00
|
|
|
QList<int> values;
|
2011-04-27 10:05:43 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
static QByteArray createPropertyInfo()
|
|
|
|
{
|
|
|
|
qDebug("createPropertyInfo:");
|
|
|
|
|
2012-04-23 03:00:16 +00:00
|
|
|
// we reserve one bit more than in the assert below for the sign
|
|
|
|
Q_ASSERT(maxMirroredDiff < (1<<12));
|
Update Unicode data up to v7.0
* Two newly adopted currency symbols:
the Azerbaijan manat and the Russia ruble
* Pictographic symbols (including many emoji), geometric symbols,
arrows, and ornaments originating from the Wingdings and Webdings sets
* Twenty-three new lesser-used and historic scripts
extending support for written languages of North America, China, India,
other Asian countries, and Africa
* Letters used in Teuthonista and other transcriptional systems,
and a new notational set, Duployan
For more details, see http://www.unicode.org/versions/Unicode7.0.0/
The Properties struct's .*Diff members were narrowed down
to signed 15 bits and the unicodeVersion has been expanded to 8 bits.
[ChangeLog][QtCore] Unicode data updated to v.7.0
Change-Id: I93ab6f79fa3b05f61abc7279f1d046834c1c1a0b
Reviewed-by: Lars Knoll <lars.knoll@digia.com>
2015-03-23 21:01:06 +00:00
|
|
|
Q_ASSERT(maxLowerCaseDiff < (1<<13));
|
|
|
|
Q_ASSERT(maxUpperCaseDiff < (1<<13));
|
|
|
|
Q_ASSERT(maxTitleCaseDiff < (1<<13));
|
|
|
|
Q_ASSERT(maxCaseFoldDiff < (1<<13));
|
2012-04-23 03:00:16 +00:00
|
|
|
|
2011-04-27 10:05:43 +00:00
|
|
|
const int BMP_BLOCKSIZE = 32;
|
|
|
|
const int BMP_SHIFT = 5;
|
|
|
|
const int BMP_END = 0x11000;
|
|
|
|
const int SMP_END = 0x110000;
|
|
|
|
const int SMP_BLOCKSIZE = 256;
|
|
|
|
const int SMP_SHIFT = 8;
|
|
|
|
|
2012-06-17 01:20:59 +00:00
|
|
|
QList<UniqueBlock> uniqueBlocks;
|
2020-07-07 10:04:21 +00:00
|
|
|
QList<int> blockMap;
|
2011-04-27 10:05:43 +00:00
|
|
|
int used = 0;
|
|
|
|
|
2020-08-12 12:17:50 +00:00
|
|
|
// Group BMP data into blocks indexed by their 12 most significant bits
|
|
|
|
// (blockId = ucs >> 5):
|
2011-04-27 10:05:43 +00:00
|
|
|
for (int block = 0; block < BMP_END/BMP_BLOCKSIZE; ++block) {
|
2012-06-17 01:20:59 +00:00
|
|
|
UniqueBlock b;
|
|
|
|
b.values.reserve(BMP_BLOCKSIZE);
|
2011-04-27 10:05:43 +00:00
|
|
|
for (int i = 0; i < BMP_BLOCKSIZE; ++i) {
|
|
|
|
int uc = block*BMP_BLOCKSIZE + i;
|
2012-05-08 01:43:16 +00:00
|
|
|
UnicodeData &d = UnicodeData::valueRef(uc);
|
2012-06-17 01:20:59 +00:00
|
|
|
b.values.append(d.propertyIndex);
|
2011-04-27 10:05:43 +00:00
|
|
|
}
|
2012-06-17 01:20:59 +00:00
|
|
|
int index = uniqueBlocks.indexOf(b);
|
2011-04-27 10:05:43 +00:00
|
|
|
if (index == -1) {
|
2012-06-17 01:20:59 +00:00
|
|
|
index = uniqueBlocks.size();
|
2011-04-27 10:05:43 +00:00
|
|
|
b.index = used;
|
|
|
|
used += BMP_BLOCKSIZE;
|
2012-06-17 01:20:59 +00:00
|
|
|
uniqueBlocks.append(b);
|
2011-04-27 10:05:43 +00:00
|
|
|
}
|
2012-06-17 01:20:59 +00:00
|
|
|
blockMap.append(uniqueBlocks.at(index).index);
|
2011-04-27 10:05:43 +00:00
|
|
|
}
|
2012-06-17 01:20:59 +00:00
|
|
|
int bmp_blocks = uniqueBlocks.size();
|
2011-04-27 10:05:43 +00:00
|
|
|
|
2020-08-12 12:17:50 +00:00
|
|
|
// Group SMP data into blocks indexed by their 9 most significant bits, plus
|
|
|
|
// an offset to put them after the BMP blocks (blockId = (ucs >> 8) + 0x880):
|
2011-04-27 10:05:43 +00:00
|
|
|
for (int block = BMP_END/SMP_BLOCKSIZE; block < SMP_END/SMP_BLOCKSIZE; ++block) {
|
2012-06-17 01:20:59 +00:00
|
|
|
UniqueBlock b;
|
|
|
|
b.values.reserve(SMP_BLOCKSIZE);
|
2011-04-27 10:05:43 +00:00
|
|
|
for (int i = 0; i < SMP_BLOCKSIZE; ++i) {
|
|
|
|
int uc = block*SMP_BLOCKSIZE + i;
|
2012-05-08 01:43:16 +00:00
|
|
|
UnicodeData &d = UnicodeData::valueRef(uc);
|
2012-06-17 01:20:59 +00:00
|
|
|
b.values.append(d.propertyIndex);
|
2011-04-27 10:05:43 +00:00
|
|
|
}
|
2012-06-17 01:20:59 +00:00
|
|
|
int index = uniqueBlocks.indexOf(b);
|
2011-04-27 10:05:43 +00:00
|
|
|
if (index == -1) {
|
2012-06-17 01:20:59 +00:00
|
|
|
index = uniqueBlocks.size();
|
2011-04-27 10:05:43 +00:00
|
|
|
b.index = used;
|
|
|
|
used += SMP_BLOCKSIZE;
|
2012-06-17 01:20:59 +00:00
|
|
|
uniqueBlocks.append(b);
|
2011-04-27 10:05:43 +00:00
|
|
|
}
|
2012-06-17 01:20:59 +00:00
|
|
|
blockMap.append(uniqueBlocks.at(index).index);
|
2011-04-27 10:05:43 +00:00
|
|
|
}
|
2012-06-17 01:20:59 +00:00
|
|
|
int smp_blocks = uniqueBlocks.size() - bmp_blocks;
|
2011-04-27 10:05:43 +00:00
|
|
|
|
2012-06-17 01:20:59 +00:00
|
|
|
int bmp_block_data = bmp_blocks*BMP_BLOCKSIZE*sizeof(unsigned short);
|
|
|
|
int bmp_trie = BMP_END/BMP_BLOCKSIZE*sizeof(unsigned short);
|
2011-04-27 10:05:43 +00:00
|
|
|
int bmp_mem = bmp_block_data + bmp_trie;
|
2012-06-17 01:20:59 +00:00
|
|
|
qDebug(" %d unique blocks in BMP.", bmp_blocks);
|
2011-04-27 10:05:43 +00:00
|
|
|
qDebug(" block data uses: %d bytes", bmp_block_data);
|
|
|
|
qDebug(" trie data uses : %d bytes", bmp_trie);
|
|
|
|
|
2012-06-17 01:20:59 +00:00
|
|
|
int smp_block_data = smp_blocks*SMP_BLOCKSIZE*sizeof(unsigned short);
|
|
|
|
int smp_trie = (SMP_END-BMP_END)/SMP_BLOCKSIZE*sizeof(unsigned short);
|
2011-04-27 10:05:43 +00:00
|
|
|
int smp_mem = smp_block_data + smp_trie;
|
2012-06-17 01:20:59 +00:00
|
|
|
qDebug(" %d unique blocks in SMP.", smp_blocks);
|
2011-04-27 10:05:43 +00:00
|
|
|
qDebug(" block data uses: %d bytes", smp_block_data);
|
|
|
|
qDebug(" trie data uses : %d bytes", smp_trie);
|
|
|
|
|
2012-06-17 01:20:59 +00:00
|
|
|
int prop_data = uniqueProperties.size() * SizeOfPropertiesStruct;
|
|
|
|
qDebug("\n properties data uses : %d bytes", prop_data);
|
|
|
|
qDebug(" memory usage: %d bytes", bmp_mem + smp_mem + prop_data);
|
|
|
|
|
2020-08-12 12:17:50 +00:00
|
|
|
Q_ASSERT(blockMap.size() == BMP_END/BMP_BLOCKSIZE +(SMP_END-BMP_END)/SMP_BLOCKSIZE); // 0x1870
|
2012-06-17 01:20:59 +00:00
|
|
|
Q_ASSERT(blockMap.last() + blockMap.size() < (1<<(sizeof(unsigned short)*8)));
|
2011-04-27 10:05:43 +00:00
|
|
|
|
2025-08-15 17:49:56 +00:00
|
|
|
QByteArray out;
|
|
|
|
out += "static constexpr char32_t MaxSeparatorCodepoint = 0x";
|
|
|
|
out += QByteArray::number(maxSeparatorCodepoint, 16);
|
|
|
|
out += ";\n";
|
|
|
|
|
|
|
|
out += "\nstatic constexpr unsigned short uc_property_trie[] = {\n";
|
2020-08-12 12:17:50 +00:00
|
|
|
// First write the map from blockId to indices of unique blocks:
|
2015-11-06 20:31:43 +00:00
|
|
|
out += " // [0x0..0x" + QByteArray::number(BMP_END, 16) + ")";
|
2011-04-27 10:05:43 +00:00
|
|
|
for (int i = 0; i < BMP_END/BMP_BLOCKSIZE; ++i) {
|
|
|
|
if (!(i % 8)) {
|
|
|
|
if (out.endsWith(' '))
|
|
|
|
out.chop(1);
|
|
|
|
if (!((i*BMP_BLOCKSIZE) % 0x1000))
|
|
|
|
out += "\n";
|
|
|
|
out += "\n ";
|
|
|
|
}
|
|
|
|
out += QByteArray::number(blockMap.at(i) + blockMap.size());
|
|
|
|
out += ", ";
|
|
|
|
}
|
|
|
|
if (out.endsWith(' '))
|
|
|
|
out.chop(1);
|
2015-11-06 20:31:43 +00:00
|
|
|
out += "\n\n // [0x" + QByteArray::number(BMP_END, 16) + "..0x" + QByteArray::number(SMP_END, 16) + ")\n";
|
2011-04-27 10:05:43 +00:00
|
|
|
for (int i = BMP_END/BMP_BLOCKSIZE; i < blockMap.size(); ++i) {
|
|
|
|
if (!(i % 8)) {
|
|
|
|
if (out.endsWith(' '))
|
|
|
|
out.chop(1);
|
|
|
|
if (!(i % (0x10000/SMP_BLOCKSIZE)))
|
|
|
|
out += "\n";
|
|
|
|
out += "\n ";
|
|
|
|
}
|
|
|
|
out += QByteArray::number(blockMap.at(i) + blockMap.size());
|
|
|
|
out += ", ";
|
|
|
|
}
|
|
|
|
if (out.endsWith(' '))
|
|
|
|
out.chop(1);
|
|
|
|
out += "\n";
|
2020-08-12 12:17:50 +00:00
|
|
|
// Then write the contents of the unique blocks, at the anticipated indices.
|
|
|
|
// Each unique block is a list of UnicodeData::propertyIndex values, whch
|
|
|
|
// are indices into the uc_properties table.
|
2012-06-17 01:20:59 +00:00
|
|
|
for (int i = 0; i < uniqueBlocks.size(); ++i) {
|
2011-04-27 10:05:43 +00:00
|
|
|
if (out.endsWith(' '))
|
|
|
|
out.chop(1);
|
|
|
|
out += "\n";
|
2012-06-17 01:20:59 +00:00
|
|
|
const UniqueBlock &b = uniqueBlocks.at(i);
|
|
|
|
for (int j = 0; j < b.values.size(); ++j) {
|
2011-04-27 10:05:43 +00:00
|
|
|
if (!(j % 8)) {
|
|
|
|
if (out.endsWith(' '))
|
|
|
|
out.chop(1);
|
|
|
|
out += "\n ";
|
|
|
|
}
|
2012-06-17 01:20:59 +00:00
|
|
|
out += QByteArray::number(b.values.at(j));
|
2011-04-27 10:05:43 +00:00
|
|
|
out += ", ";
|
|
|
|
}
|
|
|
|
}
|
2015-11-06 20:31:43 +00:00
|
|
|
if (out.endsWith(", "))
|
|
|
|
out.chop(2);
|
2012-06-17 01:20:59 +00:00
|
|
|
out += "\n};\n\n";
|
2011-04-27 10:05:43 +00:00
|
|
|
|
2022-05-24 02:16:12 +00:00
|
|
|
out += "static constexpr Properties uc_properties[] = {";
|
2011-04-27 10:05:43 +00:00
|
|
|
// keep in sync with the property declaration
|
|
|
|
for (int i = 0; i < uniqueProperties.size(); ++i) {
|
2012-05-08 01:43:16 +00:00
|
|
|
const PropertyFlags &p = uniqueProperties.at(i);
|
2012-04-23 03:00:16 +00:00
|
|
|
out += "\n { ";
|
2024-03-19 07:59:18 +00:00
|
|
|
// " ushort category : 5;\n"
|
2011-04-27 10:05:43 +00:00
|
|
|
out += QByteArray::number( p.category );
|
|
|
|
out += ", ";
|
2024-03-19 07:59:18 +00:00
|
|
|
// " ushort direction : 5;\n"
|
2011-04-27 10:05:43 +00:00
|
|
|
out += QByteArray::number( p.direction );
|
|
|
|
out += ", ";
|
2024-03-19 07:59:18 +00:00
|
|
|
// " ushort emojiFlags : 6; /* 5 used */\n"
|
|
|
|
out += QByteArray::number ( p.emojiFlags );
|
|
|
|
out += ", ";
|
2012-06-17 01:20:59 +00:00
|
|
|
// " ushort combiningClass : 8;\n"
|
2011-04-27 10:05:43 +00:00
|
|
|
out += QByteArray::number( p.combiningClass );
|
|
|
|
out += ", ";
|
2014-01-26 00:42:37 +00:00
|
|
|
// " ushort joining : 3;\n"
|
2011-04-27 10:05:43 +00:00
|
|
|
out += QByteArray::number( p.joining );
|
|
|
|
out += ", ";
|
Update Unicode data up to v7.0
* Two newly adopted currency symbols:
the Azerbaijan manat and the Russia ruble
* Pictographic symbols (including many emoji), geometric symbols,
arrows, and ornaments originating from the Wingdings and Webdings sets
* Twenty-three new lesser-used and historic scripts
extending support for written languages of North America, China, India,
other Asian countries, and Africa
* Letters used in Teuthonista and other transcriptional systems,
and a new notational set, Duployan
For more details, see http://www.unicode.org/versions/Unicode7.0.0/
The Properties struct's .*Diff members were narrowed down
to signed 15 bits and the unicodeVersion has been expanded to 8 bits.
[ChangeLog][QtCore] Unicode data updated to v.7.0
Change-Id: I93ab6f79fa3b05f61abc7279f1d046834c1c1a0b
Reviewed-by: Lars Knoll <lars.knoll@digia.com>
2015-03-23 21:01:06 +00:00
|
|
|
// " signed short digitValue : 5;\n"
|
2011-04-27 10:05:43 +00:00
|
|
|
out += QByteArray::number( p.digitValue );
|
|
|
|
out += ", ";
|
2012-06-11 12:19:24 +00:00
|
|
|
// " signed short mirrorDiff : 16;\n"
|
2011-04-27 10:05:43 +00:00
|
|
|
out += QByteArray::number( p.mirrorDiff );
|
|
|
|
out += ", ";
|
2022-05-06 11:44:58 +00:00
|
|
|
// " ushort unicodeVersion : 5; /* 5 used */\n"
|
2019-09-03 18:53:31 +00:00
|
|
|
out += QByteArray::number( p.age );
|
|
|
|
out += ", ";
|
2022-05-06 11:44:58 +00:00
|
|
|
// " ushort eastAsianWidth : 3;" /* 3 used */\n"
|
|
|
|
out += QByteArray::number( static_cast<unsigned int>(p.eastAsianWidth) );
|
|
|
|
out += ", ";
|
2019-09-03 18:53:31 +00:00
|
|
|
// " ushort nfQuickCheck : 8;\n"
|
|
|
|
out += QByteArray::number( p.nfQuickCheck );
|
|
|
|
out += ", ";
|
|
|
|
// " struct {\n"
|
|
|
|
// " ushort special : 1;\n"
|
|
|
|
// " signed short diff : 15;\n"
|
|
|
|
// " } cases[NumCases];\n"
|
2024-03-19 07:59:18 +00:00
|
|
|
out += "{ {";
|
2012-06-11 12:19:24 +00:00
|
|
|
out += QByteArray::number( p.lowerCaseSpecial );
|
|
|
|
out += ", ";
|
Update Unicode data up to v7.0
* Two newly adopted currency symbols:
the Azerbaijan manat and the Russia ruble
* Pictographic symbols (including many emoji), geometric symbols,
arrows, and ornaments originating from the Wingdings and Webdings sets
* Twenty-three new lesser-used and historic scripts
extending support for written languages of North America, China, India,
other Asian countries, and Africa
* Letters used in Teuthonista and other transcriptional systems,
and a new notational set, Duployan
For more details, see http://www.unicode.org/versions/Unicode7.0.0/
The Properties struct's .*Diff members were narrowed down
to signed 15 bits and the unicodeVersion has been expanded to 8 bits.
[ChangeLog][QtCore] Unicode data updated to v.7.0
Change-Id: I93ab6f79fa3b05f61abc7279f1d046834c1c1a0b
Reviewed-by: Lars Knoll <lars.knoll@digia.com>
2015-03-23 21:01:06 +00:00
|
|
|
out += QByteArray::number( p.lowerCaseDiff );
|
2019-09-03 18:53:31 +00:00
|
|
|
out += "}, {";
|
2012-06-11 12:19:24 +00:00
|
|
|
out += QByteArray::number( p.upperCaseSpecial );
|
|
|
|
out += ", ";
|
Update Unicode data up to v7.0
* Two newly adopted currency symbols:
the Azerbaijan manat and the Russia ruble
* Pictographic symbols (including many emoji), geometric symbols,
arrows, and ornaments originating from the Wingdings and Webdings sets
* Twenty-three new lesser-used and historic scripts
extending support for written languages of North America, China, India,
other Asian countries, and Africa
* Letters used in Teuthonista and other transcriptional systems,
and a new notational set, Duployan
For more details, see http://www.unicode.org/versions/Unicode7.0.0/
The Properties struct's .*Diff members were narrowed down
to signed 15 bits and the unicodeVersion has been expanded to 8 bits.
[ChangeLog][QtCore] Unicode data updated to v.7.0
Change-Id: I93ab6f79fa3b05f61abc7279f1d046834c1c1a0b
Reviewed-by: Lars Knoll <lars.knoll@digia.com>
2015-03-23 21:01:06 +00:00
|
|
|
out += QByteArray::number( p.upperCaseDiff );
|
2019-09-03 18:53:31 +00:00
|
|
|
out += "}, {";
|
2012-06-11 12:19:24 +00:00
|
|
|
out += QByteArray::number( p.titleCaseSpecial );
|
|
|
|
out += ", ";
|
Update Unicode data up to v7.0
* Two newly adopted currency symbols:
the Azerbaijan manat and the Russia ruble
* Pictographic symbols (including many emoji), geometric symbols,
arrows, and ornaments originating from the Wingdings and Webdings sets
* Twenty-three new lesser-used and historic scripts
extending support for written languages of North America, China, India,
other Asian countries, and Africa
* Letters used in Teuthonista and other transcriptional systems,
and a new notational set, Duployan
For more details, see http://www.unicode.org/versions/Unicode7.0.0/
The Properties struct's .*Diff members were narrowed down
to signed 15 bits and the unicodeVersion has been expanded to 8 bits.
[ChangeLog][QtCore] Unicode data updated to v.7.0
Change-Id: I93ab6f79fa3b05f61abc7279f1d046834c1c1a0b
Reviewed-by: Lars Knoll <lars.knoll@digia.com>
2015-03-23 21:01:06 +00:00
|
|
|
out += QByteArray::number( p.titleCaseDiff );
|
2019-09-03 18:53:31 +00:00
|
|
|
out += "}, {";
|
2012-06-11 12:19:24 +00:00
|
|
|
out += QByteArray::number( p.caseFoldSpecial );
|
|
|
|
out += ", ";
|
Update Unicode data up to v7.0
* Two newly adopted currency symbols:
the Azerbaijan manat and the Russia ruble
* Pictographic symbols (including many emoji), geometric symbols,
arrows, and ornaments originating from the Wingdings and Webdings sets
* Twenty-three new lesser-used and historic scripts
extending support for written languages of North America, China, India,
other Asian countries, and Africa
* Letters used in Teuthonista and other transcriptional systems,
and a new notational set, Duployan
For more details, see http://www.unicode.org/versions/Unicode7.0.0/
The Properties struct's .*Diff members were narrowed down
to signed 15 bits and the unicodeVersion has been expanded to 8 bits.
[ChangeLog][QtCore] Unicode data updated to v.7.0
Change-Id: I93ab6f79fa3b05f61abc7279f1d046834c1c1a0b
Reviewed-by: Lars Knoll <lars.knoll@digia.com>
2015-03-23 21:01:06 +00:00
|
|
|
out += QByteArray::number( p.caseFoldDiff );
|
2019-09-03 18:53:31 +00:00
|
|
|
out += "} }, ";
|
2017-12-12 09:14:28 +00:00
|
|
|
// " ushort graphemeBreakClass : 5; /* 5 used */\n"
|
|
|
|
// " ushort wordBreakClass : 5; /* 5 used */\n"
|
|
|
|
// " ushort lineBreakClass : 6; /* 6 used */\n"
|
2012-06-17 01:55:07 +00:00
|
|
|
out += QByteArray::number( p.graphemeBreakClass );
|
2011-04-27 10:05:43 +00:00
|
|
|
out += ", ";
|
2012-06-17 01:55:07 +00:00
|
|
|
out += QByteArray::number( p.wordBreakClass );
|
2011-04-27 10:05:43 +00:00
|
|
|
out += ", ";
|
2012-06-17 01:55:07 +00:00
|
|
|
out += QByteArray::number( p.lineBreakClass );
|
2012-06-11 12:19:24 +00:00
|
|
|
out += ", ";
|
2021-07-30 10:09:46 +00:00
|
|
|
// " ushort sentenceBreakClass : 4; /* 4 used */\n"
|
2019-09-03 21:40:56 +00:00
|
|
|
out += QByteArray::number( p.sentenceBreakClass );
|
|
|
|
out += ", ";
|
2021-07-30 10:09:46 +00:00
|
|
|
// " ushort idnaStatus : 4; /* 3 used */\n"
|
|
|
|
out += QByteArray::number( static_cast<unsigned int>(p.idnaStatus) );
|
|
|
|
out += ", ";
|
Update Unicode data up to v7.0
* Two newly adopted currency symbols:
the Azerbaijan manat and the Russia ruble
* Pictographic symbols (including many emoji), geometric symbols,
arrows, and ornaments originating from the Wingdings and Webdings sets
* Twenty-three new lesser-used and historic scripts
extending support for written languages of North America, China, India,
other Asian countries, and Africa
* Letters used in Teuthonista and other transcriptional systems,
and a new notational set, Duployan
For more details, see http://www.unicode.org/versions/Unicode7.0.0/
The Properties struct's .*Diff members were narrowed down
to signed 15 bits and the unicodeVersion has been expanded to 8 bits.
[ChangeLog][QtCore] Unicode data updated to v.7.0
Change-Id: I93ab6f79fa3b05f61abc7279f1d046834c1c1a0b
Reviewed-by: Lars Knoll <lars.knoll@digia.com>
2015-03-23 21:01:06 +00:00
|
|
|
// " ushort script : 8;\n"
|
2012-06-11 12:19:24 +00:00
|
|
|
out += QByteArray::number( p.script );
|
2012-04-23 03:00:16 +00:00
|
|
|
out += " },";
|
2011-04-27 10:05:43 +00:00
|
|
|
}
|
2015-11-06 20:31:43 +00:00
|
|
|
if (out.endsWith(','))
|
|
|
|
out.chop(1);
|
2012-04-23 03:00:16 +00:00
|
|
|
out += "\n};\n\n";
|
|
|
|
|
2020-04-23 19:10:48 +00:00
|
|
|
out += "Q_DECL_CONST_FUNCTION static inline const Properties *qGetProp(char32_t ucs4) noexcept\n"
|
2011-04-27 10:05:43 +00:00
|
|
|
"{\n"
|
2020-07-31 12:55:57 +00:00
|
|
|
" Q_ASSERT(ucs4 <= QChar::LastValidCodePoint);\n"
|
2020-07-31 13:47:07 +00:00
|
|
|
" if (ucs4 < 0x" + QByteArray::number(BMP_END, 16) + ")\n"
|
|
|
|
" return uc_properties + uc_property_trie[uc_property_trie[ucs4 >> "
|
|
|
|
+ QByteArray::number(BMP_SHIFT) + "] + (ucs4 & 0x"
|
|
|
|
+ QByteArray::number(BMP_BLOCKSIZE - 1, 16)+ ")];\n"
|
|
|
|
"\n"
|
|
|
|
" return uc_properties\n"
|
|
|
|
" + uc_property_trie[uc_property_trie[((ucs4 - 0x"
|
|
|
|
+ QByteArray::number(BMP_END, 16) + ") >> "
|
|
|
|
+ QByteArray::number(SMP_SHIFT) + ") + 0x"
|
|
|
|
+ QByteArray::number(BMP_END / BMP_BLOCKSIZE, 16) + "] + (ucs4 & 0x"
|
|
|
|
+ QByteArray::number(SMP_BLOCKSIZE - 1, 16) + ")];\n"
|
2011-04-27 10:05:43 +00:00
|
|
|
"}\n"
|
|
|
|
"\n"
|
2020-04-22 10:50:55 +00:00
|
|
|
"Q_DECL_CONST_FUNCTION static inline const Properties *qGetProp(char16_t ucs2) noexcept\n"
|
|
|
|
"{\n"
|
2020-07-31 13:47:07 +00:00
|
|
|
" return uc_properties + uc_property_trie[uc_property_trie[ucs2 >> "
|
|
|
|
+ QByteArray::number(BMP_SHIFT) + "] + (ucs2 & 0x"
|
|
|
|
+ QByteArray::number(BMP_BLOCKSIZE - 1, 16) + ")];\n"
|
2020-04-22 10:50:55 +00:00
|
|
|
"}\n"
|
|
|
|
"\n"
|
2025-08-25 10:20:50 +00:00
|
|
|
"const Properties * QT_FASTCALL properties(char32_t ucs4) noexcept\n"
|
2011-04-27 10:05:43 +00:00
|
|
|
"{\n"
|
2012-04-23 03:00:16 +00:00
|
|
|
" return qGetProp(ucs4);\n"
|
2011-04-27 10:05:43 +00:00
|
|
|
"}\n"
|
|
|
|
"\n"
|
2025-08-25 10:20:50 +00:00
|
|
|
"const Properties * QT_FASTCALL properties(char16_t ucs2) noexcept\n"
|
2011-04-27 10:05:43 +00:00
|
|
|
"{\n"
|
2012-04-23 03:00:16 +00:00
|
|
|
" return qGetProp(ucs2);\n"
|
2011-04-27 10:05:43 +00:00
|
|
|
"}\n\n";
|
|
|
|
|
2020-04-23 19:10:48 +00:00
|
|
|
out += "Q_CORE_EXPORT GraphemeBreakClass QT_FASTCALL graphemeBreakClass(char32_t ucs4) noexcept\n"
|
2012-05-09 13:44:36 +00:00
|
|
|
"{\n"
|
2015-11-06 20:31:43 +00:00
|
|
|
" return static_cast<GraphemeBreakClass>(qGetProp(ucs4)->graphemeBreakClass);\n"
|
2012-05-09 13:44:36 +00:00
|
|
|
"}\n"
|
|
|
|
"\n"
|
2020-04-23 19:10:48 +00:00
|
|
|
"Q_CORE_EXPORT WordBreakClass QT_FASTCALL wordBreakClass(char32_t ucs4) noexcept\n"
|
2012-05-09 13:44:36 +00:00
|
|
|
"{\n"
|
2015-11-06 20:31:43 +00:00
|
|
|
" return static_cast<WordBreakClass>(qGetProp(ucs4)->wordBreakClass);\n"
|
2012-05-09 13:44:36 +00:00
|
|
|
"}\n"
|
|
|
|
"\n"
|
2020-04-23 19:10:48 +00:00
|
|
|
"Q_CORE_EXPORT SentenceBreakClass QT_FASTCALL sentenceBreakClass(char32_t ucs4) noexcept\n"
|
2012-05-09 13:44:36 +00:00
|
|
|
"{\n"
|
2015-11-06 20:31:43 +00:00
|
|
|
" return static_cast<SentenceBreakClass>(qGetProp(ucs4)->sentenceBreakClass);\n"
|
2012-05-09 13:44:36 +00:00
|
|
|
"}\n"
|
|
|
|
"\n"
|
2020-04-23 19:10:48 +00:00
|
|
|
"Q_CORE_EXPORT LineBreakClass QT_FASTCALL lineBreakClass(char32_t ucs4) noexcept\n"
|
2011-04-27 10:05:43 +00:00
|
|
|
"{\n"
|
2015-11-06 20:31:43 +00:00
|
|
|
" return static_cast<LineBreakClass>(qGetProp(ucs4)->lineBreakClass);\n"
|
2012-06-11 12:19:24 +00:00
|
|
|
"}\n"
|
2021-07-30 10:09:46 +00:00
|
|
|
"\n"
|
|
|
|
"Q_CORE_EXPORT IdnaStatus QT_FASTCALL idnaStatus(char32_t ucs4) noexcept\n"
|
|
|
|
"{\n"
|
|
|
|
" return static_cast<IdnaStatus>(qGetProp(ucs4)->idnaStatus);\n"
|
|
|
|
"}\n"
|
2022-05-06 11:44:58 +00:00
|
|
|
"\n"
|
|
|
|
"Q_CORE_EXPORT EastAsianWidth QT_FASTCALL eastAsianWidth(char32_t ucs4) noexcept\n"
|
|
|
|
"{\n"
|
|
|
|
" return static_cast<EastAsianWidth>(qGetProp(ucs4)->eastAsianWidth);\n"
|
|
|
|
"}\n"
|
2012-12-08 03:36:49 +00:00
|
|
|
"\n";
|
2011-04-27 10:05:43 +00:00
|
|
|
|
2012-04-23 03:00:16 +00:00
|
|
|
return out;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Generates the C++ source for the specialCaseMap array and the
// MaxSpecialCaseLength constant.
//
// The file-level specialCaseMap list is read as a placeholder at position 0
// followed by records, each made of a length word n and n further values.
// Every record is written on its own row; MaxSpecialCaseLength is the largest
// n encountered (0 when there are no records). The memory used by the table
// is reported through qDebug().
static QByteArray createSpecialCaseMap()
{
    qDebug("createSpecialCaseMap:");

    QByteArray out
         = "static constexpr unsigned short specialCaseMap[] = {\n"
           " 0x0, // placeholder";

    int i = 1; // position 0 is the placeholder emitted above
    int maxN = 0;
    while (i < specialCaseMap.size()) {
        out += "\n ";
        int n = specialCaseMap.at(i);
        // j == 0 writes the length word itself, j in [1, n] the values.
        for (int j = 0; j <= n; ++j) {
            out += QByteArray(" 0x") + QByteArray::number(specialCaseMap.at(i+j), 16);
            out += ",";
        }
        i += n + 1;
        maxN = std::max(maxN, n);
    }
    // Drop the comma after the last value. With no records at all the text
    // still ends in the placeholder comment, which must be left intact.
    if (out.endsWith(','))
        out.chop(1);
    out += "\n};\n\nconstexpr unsigned int MaxSpecialCaseLength = ";
    out += QByteArray::number(maxN);
    out += ";\n\n";

    qDebug(" memory usage: %llu bytes",
           qulonglong{specialCaseMap.size() * sizeof(unsigned short)});

    return out;
}
|
|
|
|
|
|
|
|
|
|
|
|
// Generates the C++ source for the decomposition lookup.
//
// The returned text contains the uc_decomposition_trie table (block map
// followed by the unique blocks), the GET_DECOMPOSITION_INDEX macro and the
// uc_decomposition_map array. A trie value is either the position of a
// record inside uc_decomposition_map or 0xffff for code points without
// decomposition. Size statistics are reported through qDebug().
static QByteArray createCompositionInfo()
{
    qDebug("createCompositionInfo: highestComposedCharacter=0x%x", highestComposedCharacter);

    const int BMP_BLOCKSIZE = 16;
    const int BMP_SHIFT = 4;
    const int BMP_END = 0x3400; // start of Han
    const int SMP_END = 0x30000;
    const int SMP_BLOCKSIZE = 256;
    const int SMP_SHIFT = 8;

    if (SMP_END <= highestComposedCharacter)
        qFatal("end of table smaller than highest composed character 0x%x", highestComposedCharacter);

    QList<unsigned short> decompositions;
    int tableIndex = 0;

    QList<UniqueBlock> uniqueBlocks;
    QList<int> blockMap;
    int used = 0;

    // Appends the record for one code point to `decompositions` and returns
    // the position of that record, or 0xffff when there is no decomposition.
    // A record is a header word (decomposition type in the low byte, UTF-16
    // length shifted left by 8) followed by the UTF-16 code units.
    const auto storeDecomposition = [&](const UnicodeData &d) -> int {
        if (d.decomposition.isEmpty())
            return 0xffff;

        const int recordStart = tableIndex;
        int utf16Length = 0;
        decompositions.append(0); // header, filled in once the length is known
        for (int code : d.decomposition) {
            if (QChar::requiresSurrogates(code)) {
                // save as surrogate pair
                decompositions.append(QChar::highSurrogate(code));
                decompositions.append(QChar::lowSurrogate(code));
                utf16Length += 2;
            } else {
                decompositions.append(code);
                utf16Length++;
            }
        }
        decompositions[recordStart] = d.decompositionType + (utf16Length<<8);
        tableIndex += utf16Length + 1;
        return recordStart;
    };

    // Walks the blocks [firstBlock, endBlock), each covering blockSize
    // consecutive code points, stores blocks not seen before and appends to
    // blockMap the offset of the (possibly shared) stored block.
    const auto groupBlocks = [&](int firstBlock, int endBlock, int blockSize) {
        for (int block = firstBlock; block < endBlock; ++block) {
            UniqueBlock candidate;
            candidate.values.reserve(blockSize);
            for (int offset = 0; offset < blockSize; ++offset) {
                int uc = block * blockSize + offset;
                UnicodeData &d = UnicodeData::valueRef(uc);
                candidate.values.append(storeDecomposition(d));
            }
            int slot = uniqueBlocks.indexOf(candidate);
            if (slot == -1) {
                slot = uniqueBlocks.size();
                candidate.index = used;
                used += blockSize;
                uniqueBlocks.append(candidate);
            }
            blockMap.append(uniqueBlocks.at(slot).index);
        }
    };

    groupBlocks(0, BMP_END / BMP_BLOCKSIZE, BMP_BLOCKSIZE);
    int bmp_blocks = uniqueBlocks.size();

    groupBlocks(BMP_END / SMP_BLOCKSIZE, SMP_END / SMP_BLOCKSIZE, SMP_BLOCKSIZE);
    int smp_blocks = uniqueBlocks.size() - bmp_blocks;

    // if the condition below doesn't hold anymore we need to modify our decomposition code
    Q_ASSERT(tableIndex < 0xffff);

    // Report how much memory the generated tables will take.
    int bmp_block_data = bmp_blocks * BMP_BLOCKSIZE * sizeof(unsigned short);
    int bmp_trie = BMP_END / BMP_BLOCKSIZE * sizeof(unsigned short);
    int bmp_mem = bmp_block_data + bmp_trie;
    qDebug(" %d unique blocks in BMP.", bmp_blocks);
    qDebug(" block data uses: %d bytes", bmp_block_data);
    qDebug(" trie data uses : %d bytes", bmp_trie);

    int smp_block_data = smp_blocks * SMP_BLOCKSIZE * sizeof(unsigned short);
    int smp_trie = (SMP_END - BMP_END) / SMP_BLOCKSIZE * sizeof(unsigned short);
    int smp_mem = smp_block_data + smp_trie;
    qDebug(" %d unique blocks in SMP.", smp_blocks);
    qDebug(" block data uses: %d bytes", smp_block_data);
    qDebug(" trie data uses : %d bytes", smp_trie);

    int decomposition_data = decompositions.size() * 2;
    qDebug("\n decomposition data uses : %d bytes", decomposition_data);
    qDebug(" memory usage: %d bytes", bmp_mem + smp_mem + decomposition_data);

    // Every value written to the map part of the trie must fit into an unsigned short.
    Q_ASSERT(blockMap.last() + blockMap.size() < (1 << (sizeof(unsigned short) * 8)));

    const QByteArray bmpEndHex = QByteArray::number(BMP_END, 16);
    const QByteArray smpEndHex = QByteArray::number(SMP_END, 16);
    // The block data is written right after the map, so every map entry is
    // shifted by the number of map entries.
    const qsizetype dataStart = blockMap.size();

    QByteArray out = "static constexpr unsigned short uc_decomposition_trie[] = {\n";

    // First the map from block id to the position of its unique block.
    out += " // 0 - 0x" + bmpEndHex;
    for (int i = 0; i < BMP_END / BMP_BLOCKSIZE; ++i) {
        if (i % 8 == 0) {
            // Eight entries per row; an extra blank line every 0x1000 code points.
            if (out.endsWith(' '))
                out.chop(1);
            if ((i * BMP_BLOCKSIZE) % 0x1000 == 0)
                out += "\n";
            out += "\n ";
        }
        out += QByteArray::number(blockMap.at(i) + dataStart);
        out += ", ";
    }
    if (out.endsWith(' '))
        out.chop(1);

    out += "\n\n // 0x" + bmpEndHex + " - 0x" + smpEndHex + "\n";
    for (int i = BMP_END / BMP_BLOCKSIZE; i < blockMap.size(); ++i) {
        if (i % 8 == 0) {
            if (out.endsWith(' '))
                out.chop(1);
            if (i % (0x10000 / SMP_BLOCKSIZE) == 0)
                out += "\n";
            out += "\n ";
        }
        out += QByteArray::number(blockMap.at(i) + dataStart);
        out += ", ";
    }
    if (out.endsWith(' '))
        out.chop(1);
    out += "\n";

    // Then the contents of the unique blocks.
    for (int blockNo = 0; blockNo < uniqueBlocks.size(); ++blockNo) {
        if (out.endsWith(' '))
            out.chop(1);
        out += "\n";
        const UniqueBlock &blk = uniqueBlocks.at(blockNo);
        for (int k = 0; k < blk.values.size(); ++k) {
            if (k % 8 == 0) {
                if (out.endsWith(' '))
                    out.chop(1);
                out += "\n ";
            }
            out += "0x" + QByteArray::number(blk.values.at(k), 16);
            out += ", ";
        }
    }
    // Remove the trailing ", " left by the last value.
    if (out.endsWith(' '))
        out.chop(2);
    out += "\n};\n\n";

    // Lookup macro: picks the block map section by range and yields 0xffff
    // for code points at or beyond SMP_END.
    out += "#define GET_DECOMPOSITION_INDEX(ucs4) \\\n"
           " (ucs4 < 0x" + bmpEndHex + " \\\n"
           " ? (uc_decomposition_trie[uc_decomposition_trie[ucs4 >> "
           + QByteArray::number(BMP_SHIFT) + "] + (ucs4 & 0x"
           + QByteArray::number(BMP_BLOCKSIZE - 1, 16) + ")]) \\\n"
           " : ucs4 < 0x" + smpEndHex + " \\\n"
           " ? uc_decomposition_trie[uc_decomposition_trie[((ucs4 - 0x"
           + bmpEndHex + ") >> "
           + QByteArray::number(SMP_SHIFT) + ") + 0x"
           + QByteArray::number(BMP_END / BMP_BLOCKSIZE, 16) + "] + (ucs4 & 0x"
           + QByteArray::number(SMP_BLOCKSIZE - 1, 16) + ")] \\\n"
           " : 0xffff)\n\n";

    // Finally the decomposition records themselves, eight words per row.
    out += "static constexpr unsigned short uc_decomposition_map[] = {";
    for (int pos = 0; pos < decompositions.size(); ++pos) {
        if (pos % 8 == 0) {
            if (out.endsWith(' '))
                out.chop(1);
            out += "\n ";
        }
        out += "0x" + QByteArray::number(decompositions.at(pos), 16);
        out += ", ";
    }
    if (out.endsWith(' '))
        out.chop(2);
    out += "\n};\n\n";

    return out;
}
|
|
|
|
|
|
|
|
/*
    Builds the C++ source text for the ligature (composition) lookup tables.

    The returned text contains, in this order:
      - uc_ligature_trie[]: a two-stage table. The first stage holds one entry
        per block of code points (BMP blocks first, then SMP blocks); each entry
        is the offset of the block's data inside the same array. The second
        stage holds the de-duplicated block data: for each code point either an
        index into uc_ligature_map or 0xffff when it has no ligatures.
      - the GET_LIGATURE_INDEX(ucs4) macro performing that two-stage lookup;
      - uc_ligature_map[]: for every code point with ligatures, a count followed
        by the sorted (u1, ligature) pairs. Pairs are stored as single UTF-16
        units in the BMP pass and as surrogate pairs in the SMP pass.

    Reads the file-level ligatureHashes, numLigatures and highestLigature;
    aborts via qFatal() if highestLigature does not fit below SMP_END.
*/
static QByteArray createLigatureInfo()
{
    qDebug("createLigatureInfo: numLigatures=%d, highestLigature=0x%x", numLigatures, highestLigature);

    // Sanity-check every stored ligature. The container is looked up by code
    // point further down (value(uc)), so the lists themselves are walked here;
    // probing value(i) for i in [0, size()) would only inspect entries whose
    // key happens to be smaller than the number of entries.
    // The const alias keeps the range-for from detaching the container.
    const auto &allLigatures = ligatureHashes;
    for (const QList<Ligature> &list : allLigatures) {
        for (const Ligature &lig : list) {
            // if the condition below doesn't hold anymore we need to modify our ligatureHelper code
            Q_ASSERT(QChar::requiresSurrogates(lig.u2) == QChar::requiresSurrogates(lig.ligature) &&
                     QChar::requiresSurrogates(lig.u2) == QChar::requiresSurrogates(lig.u1));
        }
    }

    const int BMP_BLOCKSIZE = 32;
    const int BMP_SHIFT = 5;
    const int BMP_END = 0x3100;
    const int SMP_END = 0x1FC00; // https://www.unicode.org/roadmaps/smp/
    const int SMP_BLOCKSIZE = 256;
    const int SMP_SHIFT = 8;

    if (SMP_END <= highestLigature)
        qFatal("end of table smaller than highest ligature character 0x%x", highestLigature);

    QList<unsigned short> ligatures; // contents of uc_ligature_map
    int tableIndex = 0;              // next free index in uc_ligature_map

    QList<UniqueBlock> uniqueBlocks; // de-duplicated second-stage blocks
    QList<int> blockMap;             // first stage: data offset of each block
    int used = 0;                    // total size of the unique block data so far

    // Records block b in the first stage, appending it to uniqueBlocks (and
    // assigning it a data offset) only if an equal block was not seen before.
    const auto internBlock = [&](UniqueBlock &b, int blockSize) {
        int index = uniqueBlocks.indexOf(b);
        if (index == -1) {
            index = uniqueBlocks.size();
            b.index = used;
            used += blockSize;
            uniqueBlocks.append(b);
        }
        blockMap.append(uniqueBlocks.at(index).index);
    };

    // BMP pass: small blocks, ligature data stored as single UTF-16 units.
    for (int block = 0; block < BMP_END/BMP_BLOCKSIZE; ++block) {
        UniqueBlock b;
        b.values.reserve(BMP_BLOCKSIZE);
        for (int i = 0; i < BMP_BLOCKSIZE; ++i) {
            int uc = block*BMP_BLOCKSIZE + i;
            QList<Ligature> l = ligatureHashes.value(uc);
            if (!l.isEmpty()) {
                Q_ASSERT(!QChar::requiresSurrogates(uc));
                std::sort(l.begin(), l.end()); // needed for bsearch in ligatureHelper code

                ligatures.append(l.size());
                for (int j = 0; j < l.size(); ++j) {
                    ligatures.append(l.at(j).u1);
                    ligatures.append(l.at(j).ligature);
                }
                b.values.append(tableIndex);
                tableIndex += 2*l.size() + 1; // count + (u1, ligature) per entry
            } else {
                b.values.append(0xffff);
            }
        }
        internBlock(b, BMP_BLOCKSIZE);
    }
    int bmp_blocks = uniqueBlocks.size();

    // SMP pass: larger blocks, ligature data stored as surrogate pairs.
    for (int block = BMP_END/SMP_BLOCKSIZE; block < SMP_END/SMP_BLOCKSIZE; ++block) {
        UniqueBlock b;
        b.values.reserve(SMP_BLOCKSIZE);
        for (int i = 0; i < SMP_BLOCKSIZE; ++i) {
            int uc = block*SMP_BLOCKSIZE + i;
            QList<Ligature> l = ligatureHashes.value(uc);
            if (!l.isEmpty()) {
                Q_ASSERT(QChar::requiresSurrogates(uc));
                std::sort(l.begin(), l.end()); // needed for bsearch in ligatureHelper code

                ligatures.append(l.size());
                for (int j = 0; j < l.size(); ++j) {
                    ligatures.append(QChar::highSurrogate(l.at(j).u1));
                    ligatures.append(QChar::lowSurrogate(l.at(j).u1));
                    ligatures.append(QChar::highSurrogate(l.at(j).ligature));
                    ligatures.append(QChar::lowSurrogate(l.at(j).ligature));
                }
                b.values.append(tableIndex);
                tableIndex += 4*l.size() + 1; // count + two surrogate pairs per entry
            } else {
                b.values.append(0xffff);
            }
        }
        internBlock(b, SMP_BLOCKSIZE);
    }
    int smp_blocks = uniqueBlocks.size() - bmp_blocks;

    // if the condition below doesn't hold anymore we need to modify our composition code
    Q_ASSERT(tableIndex < 0xffff);

    int bmp_block_data = bmp_blocks*BMP_BLOCKSIZE*sizeof(unsigned short);
    int bmp_trie = BMP_END/BMP_BLOCKSIZE*sizeof(unsigned short);
    int bmp_mem = bmp_block_data + bmp_trie;
    qDebug(" %d unique blocks in BMP.", bmp_blocks);
    qDebug(" block data uses: %d bytes", bmp_block_data);
    qDebug(" trie data uses : %d bytes", bmp_trie);

    int smp_block_data = smp_blocks*SMP_BLOCKSIZE*sizeof(unsigned short);
    int smp_trie = (SMP_END-BMP_END)/SMP_BLOCKSIZE*sizeof(unsigned short);
    int smp_mem = smp_block_data + smp_trie;
    qDebug(" %d unique blocks in SMP.", smp_blocks);
    qDebug(" block data uses: %d bytes", smp_block_data);
    qDebug(" trie data uses : %d bytes", smp_trie);

    int ligature_data = ligatures.size() * 2;
    qDebug("\n ligature data uses : %d bytes", ligature_data);
    qDebug(" memory usage: %d bytes", bmp_mem + smp_mem + ligature_data);

    // Every first-stage entry (offset + size of the first stage) must fit into
    // an unsigned short.
    Q_ASSERT(blockMap.last() + blockMap.size() < (1<<(sizeof(unsigned short)*8)));

    QByteArray out = "static constexpr unsigned short uc_ligature_trie[] = {\n";
    // first write the map
    out += " // 0 - 0x" + QByteArray::number(BMP_END, 16);
    for (int i = 0; i < BMP_END/BMP_BLOCKSIZE; ++i) {
        if (!(i % 8)) {
            if (out.endsWith(' '))
                out.chop(1);
            if (!((i*BMP_BLOCKSIZE) % 0x1000))
                out += "\n";
            out += "\n ";
        }
        // Offsets are relative to the block data, which follows the map.
        out += QByteArray::number(blockMap.at(i) + blockMap.size());
        out += ", ";
    }
    if (out.endsWith(' '))
        out.chop(1);
    out += "\n\n // 0x" + QByteArray::number(BMP_END, 16) + " - 0x" + QByteArray::number(SMP_END, 16) + "\n";
    for (int i = BMP_END/BMP_BLOCKSIZE; i < blockMap.size(); ++i) {
        if (!(i % 8)) {
            if (out.endsWith(' '))
                out.chop(1);
            if (!(i % (0x10000/SMP_BLOCKSIZE)))
                out += "\n";
            out += "\n ";
        }
        out += QByteArray::number(blockMap.at(i) + blockMap.size());
        out += ", ";
    }
    if (out.endsWith(' '))
        out.chop(1);
    out += "\n";
    // write the data
    for (int i = 0; i < uniqueBlocks.size(); ++i) {
        if (out.endsWith(' '))
            out.chop(1);
        out += "\n";
        const UniqueBlock &b = uniqueBlocks.at(i);
        for (int j = 0; j < b.values.size(); ++j) {
            if (!(j % 8)) {
                if (out.endsWith(' '))
                    out.chop(1);
                out += "\n ";
            }
            out += "0x" + QByteArray::number(b.values.at(j), 16);
            out += ", ";
        }
    }
    // Drop the trailing ", " of the last element.
    if (out.endsWith(' '))
        out.chop(2);
    out += "\n};\n\n";

    out += "#define GET_LIGATURE_INDEX(ucs4) \\\n"
           " (ucs4 < 0x" + QByteArray::number(BMP_END, 16) + " \\\n"
           " ? (uc_ligature_trie[uc_ligature_trie[ucs4 >> "
           + QByteArray::number(BMP_SHIFT) + "] + (ucs4 & 0x"
           + QByteArray::number(BMP_BLOCKSIZE-1, 16)+ ")]) \\\n"
           " : ucs4 < 0x" + QByteArray::number(SMP_END, 16) + " \\\n"
           " ? uc_ligature_trie[uc_ligature_trie[((ucs4 - 0x"
           + QByteArray::number(BMP_END, 16) + ") >> "
           + QByteArray::number(SMP_SHIFT) + ") + 0x"
           + QByteArray::number(BMP_END/BMP_BLOCKSIZE, 16) + "]" " + (ucs4 & 0x"
           + QByteArray::number(SMP_BLOCKSIZE-1, 16) + ")] \\\n"
           " : 0xffff)\n\n";

    out += "static constexpr unsigned short uc_ligature_map[] = {";
    for (int i = 0; i < ligatures.size(); ++i) {
        if (!(i % 8)) {
            if (out.endsWith(' '))
                out.chop(1);
            out += "\n ";
        }
        out += "0x" + QByteArray::number(ligatures.at(i), 16);
        out += ", ";
    }
    // Drop the trailing ", " of the last element.
    if (out.endsWith(' '))
        out.chop(2);
    out += "\n};\n";

    return out;
}
|
|
|
|
|
|
|
|
/*
    Returns the source text of the CasingInfo bit-field struct
    (codePoint: 16 bits, flags: 8 bits, offset: 8 bits).

    The text is fixed; no generator state is consulted.
*/
QByteArray createCasingInfo()
{
    static const char casingInfoDefinition[] =
        "struct CasingInfo {\n"
        " uint codePoint : 16;\n"
        " uint flags : 8;\n"
        " uint offset : 8;\n"
        "};\n\n";

    return QByteArray(casingInfoDefinition);
}
|
|
|
|
|
|
|
|
|
|
|
|
int main(int, char **)
|
|
|
|
{
|
|
|
|
initAgeMap();
|
2022-05-06 11:44:58 +00:00
|
|
|
initEastAsianWidthMap();
|
2011-04-27 10:05:43 +00:00
|
|
|
initCategoryMap();
|
|
|
|
initDecompositionMap();
|
|
|
|
initDirectionMap();
|
|
|
|
initJoiningMap();
|
|
|
|
initGraphemeBreak();
|
|
|
|
initWordBreak();
|
|
|
|
initSentenceBreak();
|
|
|
|
initLineBreak();
|
2012-12-08 03:36:49 +00:00
|
|
|
initScriptMap();
|
2021-07-30 10:09:46 +00:00
|
|
|
initIdnaStatusMap();
|
2024-03-19 07:59:18 +00:00
|
|
|
initEmojiFlagsMap();
|
2011-04-27 10:05:43 +00:00
|
|
|
|
|
|
|
readUnicodeData();
|
|
|
|
readBidiMirroring();
|
|
|
|
readArabicShaping();
|
|
|
|
readDerivedAge();
|
2022-05-06 11:44:58 +00:00
|
|
|
readEastAsianWidth();
|
2011-04-27 10:05:43 +00:00
|
|
|
readDerivedNormalizationProps();
|
|
|
|
readSpecialCasing();
|
|
|
|
readCaseFolding();
|
|
|
|
// readBlocks();
|
|
|
|
readScripts();
|
|
|
|
readGraphemeBreak();
|
2021-04-15 12:39:51 +00:00
|
|
|
readEmojiData();
|
2011-04-27 10:05:43 +00:00
|
|
|
readWordBreak();
|
|
|
|
readSentenceBreak();
|
|
|
|
readLineBreak();
|
2021-07-30 10:09:46 +00:00
|
|
|
readIdnaMappingTable();
|
|
|
|
|
|
|
|
resolveIdnaStatus();
|
2011-04-27 10:05:43 +00:00
|
|
|
|
|
|
|
computeUniqueProperties();
|
|
|
|
QByteArray properties = createPropertyInfo();
|
2012-04-23 03:00:16 +00:00
|
|
|
QByteArray specialCases = createSpecialCaseMap();
|
2011-04-27 10:05:43 +00:00
|
|
|
QByteArray compositions = createCompositionInfo();
|
|
|
|
QByteArray ligatures = createLigatureInfo();
|
|
|
|
QByteArray normalizationCorrections = createNormalizationCorrections();
|
2021-07-30 10:09:46 +00:00
|
|
|
QByteArray idnaMapping = createIdnaMapping();
|
2011-04-27 10:05:43 +00:00
|
|
|
|
|
|
|
QByteArray header =
|
2022-05-10 10:06:48 +00:00
|
|
|
"// Copyright (C) 2020 The Qt Company Ltd.\n"
|
2024-03-05 09:50:07 +00:00
|
|
|
"// SPDX-License-Identifier: Unicode-3.0\n"
|
2022-05-10 10:06:48 +00:00
|
|
|
"\n";
|
2011-04-27 10:05:43 +00:00
|
|
|
|
|
|
|
QByteArray note =
|
2017-12-11 10:28:11 +00:00
|
|
|
"/* This file is autogenerated from the Unicode " DATA_VERSION_S " database. Do not edit */\n\n";
|
2011-04-27 10:05:43 +00:00
|
|
|
|
|
|
|
QByteArray warning =
|
|
|
|
"//\n"
|
|
|
|
"// W A R N I N G\n"
|
|
|
|
"// -------------\n"
|
|
|
|
"//\n"
|
|
|
|
"// This file is not part of the Qt API. It exists for the convenience\n"
|
|
|
|
"// of internal files. This header file may change from version to version\n"
|
|
|
|
"// without notice, or even be removed.\n"
|
|
|
|
"//\n"
|
|
|
|
"// We mean it.\n"
|
|
|
|
"//\n\n";
|
|
|
|
|
2019-05-27 17:13:54 +00:00
|
|
|
QFile f("../../src/corelib/text/qunicodetables.cpp");
|
2025-01-17 12:13:12 +00:00
|
|
|
if (!f.open(QFile::WriteOnly|QFile::Truncate))
|
|
|
|
qFatal() << "Cannot open output file" << f.fileName() << "error:" << f.errorString();
|
2011-04-27 10:05:43 +00:00
|
|
|
f.write(header);
|
|
|
|
f.write(note);
|
2012-04-23 03:00:16 +00:00
|
|
|
f.write("#include \"qunicodetables_p.h\"\n\n");
|
2011-04-27 10:05:43 +00:00
|
|
|
f.write("QT_BEGIN_NAMESPACE\n\n");
|
2025-08-15 17:49:56 +00:00
|
|
|
f.write("namespace QUnicodeTables {\n");
|
2011-04-27 10:05:43 +00:00
|
|
|
f.write(properties);
|
2012-04-23 03:00:16 +00:00
|
|
|
f.write(specialCases);
|
2011-04-27 10:05:43 +00:00
|
|
|
f.write(compositions);
|
|
|
|
f.write(ligatures);
|
2012-04-23 03:00:16 +00:00
|
|
|
f.write("\n");
|
2011-04-27 10:05:43 +00:00
|
|
|
f.write(normalizationCorrections);
|
2021-07-30 10:09:46 +00:00
|
|
|
f.write(idnaMapping);
|
2012-04-23 03:00:16 +00:00
|
|
|
f.write("} // namespace QUnicodeTables\n\n");
|
|
|
|
f.write("using namespace QUnicodeTables;\n\n");
|
2011-04-27 10:05:43 +00:00
|
|
|
f.write("QT_END_NAMESPACE\n");
|
|
|
|
f.close();
|
|
|
|
|
2019-05-27 17:13:54 +00:00
|
|
|
f.setFileName("../../src/corelib/text/qunicodetables_p.h");
|
2025-01-17 12:13:12 +00:00
|
|
|
if (!f.open(QFile::WriteOnly | QFile::Truncate))
|
|
|
|
qFatal() << "Cannot open output file" << f.fileName() << "error:" << f.errorString();
|
2011-04-27 10:05:43 +00:00
|
|
|
f.write(header);
|
|
|
|
f.write(note);
|
|
|
|
f.write(warning);
|
|
|
|
f.write("#ifndef QUNICODETABLES_P_H\n"
|
|
|
|
"#define QUNICODETABLES_P_H\n\n"
|
2017-12-11 10:28:11 +00:00
|
|
|
"#include <QtCore/private/qglobal_p.h>\n\n"
|
2011-04-27 10:05:43 +00:00
|
|
|
"#include <QtCore/qchar.h>\n\n"
|
|
|
|
"QT_BEGIN_NAMESPACE\n\n");
|
2017-12-11 10:28:11 +00:00
|
|
|
f.write("#define UNICODE_DATA_VERSION " DATA_VERSION_STR "\n\n");
|
2011-04-27 10:05:43 +00:00
|
|
|
f.write("namespace QUnicodeTables {\n\n");
|
|
|
|
f.write(property_string);
|
2019-09-03 21:40:56 +00:00
|
|
|
f.write(sizeOfPropertiesStructCheck);
|
2022-05-06 11:44:58 +00:00
|
|
|
f.write(east_asian_width_string);
|
2012-06-17 01:55:07 +00:00
|
|
|
f.write(grapheme_break_class_string);
|
|
|
|
f.write(word_break_class_string);
|
|
|
|
f.write(sentence_break_class_string);
|
2012-04-23 03:00:16 +00:00
|
|
|
f.write(line_break_class_string);
|
2021-07-30 10:09:46 +00:00
|
|
|
f.write(idna_status_string);
|
2024-03-19 07:59:18 +00:00
|
|
|
f.write(emoji_flags_string);
|
2011-04-27 10:05:43 +00:00
|
|
|
f.write(methods);
|
|
|
|
f.write("} // namespace QUnicodeTables\n\n"
|
|
|
|
"QT_END_NAMESPACE\n\n"
|
|
|
|
"#endif // QUNICODETABLES_P_H\n");
|
|
|
|
f.close();
|
|
|
|
|
2020-07-31 13:36:44 +00:00
|
|
|
qDebug() << "maxMirroredDiff = " << Qt::hex << maxMirroredDiff;
|
|
|
|
qDebug() << "maxLowerCaseDiff = " << Qt::hex << maxLowerCaseDiff;
|
|
|
|
qDebug() << "maxUpperCaseDiff = " << Qt::hex << maxUpperCaseDiff;
|
|
|
|
qDebug() << "maxTitleCaseDiff = " << Qt::hex << maxTitleCaseDiff;
|
|
|
|
qDebug() << "maxCaseFoldDiff = " << Qt::hex << maxCaseFoldDiff;
|
2011-04-27 10:05:43 +00:00
|
|
|
#if 0
|
|
|
|
// dump(0, 0x7f);
|
|
|
|
// dump(0x620, 0x640);
|
|
|
|
// dump(0x10000, 0x10020);
|
|
|
|
// dump(0x10800, 0x10820);
|
|
|
|
|
|
|
|
qDebug("decompositionLength used:");
|
|
|
|
int totalcompositions = 0;
|
|
|
|
int sum = 0;
|
|
|
|
for (int i = 1; i < 20; ++i) {
|
|
|
|
qDebug(" length %d used %d times", i, decompositionLength.value(i, 0));
|
|
|
|
totalcompositions += i*decompositionLength.value(i, 0);
|
|
|
|
sum += decompositionLength.value(i, 0);
|
|
|
|
}
|
|
|
|
qDebug(" len decomposition map %d, average length %f, num composed chars %d",
|
|
|
|
totalcompositions, (float)totalcompositions/(float)sum, sum);
|
|
|
|
qDebug("highest composed character %x", highestComposedCharacter);
|
|
|
|
qDebug("num ligatures = %d highest=%x, maxLength=%d", numLigatures, highestLigature, longestLigature);
|
|
|
|
|
|
|
|
qBubbleSort(ligatures);
|
|
|
|
for (int i = 0; i < ligatures.size(); ++i)
|
|
|
|
qDebug("%s", ligatures.at(i).data());
|
|
|
|
|
|
|
|
// qDebug("combiningClass usage:");
|
|
|
|
// int numClasses = 0;
|
|
|
|
// for (int i = 0; i < 255; ++i) {
|
|
|
|
// int num = combiningClassUsage.value(i, 0);
|
|
|
|
// if (num) {
|
|
|
|
// ++numClasses;
|
|
|
|
// qDebug(" combiningClass %d used %d times", i, num);
|
|
|
|
// }
|
|
|
|
// }
|
|
|
|
// qDebug("total of %d combining classes used", numClasses);
|
|
|
|
|
|
|
|
#endif
|
|
|
|
}
|