util/unicode: Extract Method parseHexRange()

Extend parseHexList() to support QLatin1StringView separators, add
parseHexRange() as a wrapper around it, and use the new function
throughout the code to parse HHHHH[..HHHHH] hex ranges.

Amends the start of the public history.

Pick-to: 6.10 6.9 6.8 6.5
Change-Id: I0372e5c239642988f0e920d95108657e276b19dd
Reviewed-by: Ahmad Samir <a.samirh78@gmail.com>
This commit is contained in:
Marc Mutz 2025-08-27 12:09:12 +02:00
parent 714969e8a1
commit 6e526bf92c
1 changed file with 29 additions and 130 deletions

View File

@ -1348,16 +1348,32 @@ static int parseHex(QByteArrayView input, int lineNo)
return result;
}
// NOTE(review): this span appears to interleave the pre-change and post-change
// versions of parseHexList(): the bare signature and the two constexpr locals
// look like the old form; the template header, the `Sep sep` parameter and the
// computed `sb` look like the new form. Confirm against the actual file.
//
// Splits `input` on `sep` and converts every token with parseHex(), returning
// the values in input order. `lineNo` is only forwarded to parseHex().
QVarLengthArray<int, 4> parseHexList(QByteArrayView input, int lineNo)
template <typename Sep = char16_t>
QVarLengthArray<int, 4> parseHexList(QByteArrayView input, int lineNo, Sep sep = u' ')
{
QVarLengthArray<int, 4> result;
constexpr char16_t sep = u' ';
constexpr auto sb = Qt::SkipEmptyParts;
// Empty tokens are skipped only for the space separator; any other separator
// keeps them, so they are handed to parseHex() like every other token
// (presumably so that a malformed field is not silently shortened - confirm).
const auto sb = sep == u' ' ? Qt::SkipEmptyParts : Qt::KeepEmptyParts;
// The bytes are viewed as Latin-1 for tokenization.
for (auto e : qTokenize(QLatin1StringView{input}, sep, sb))
result.push_back(parseHex(e, lineNo));
return result;
}
/*
    Parses a code point field of the form HHHH or HHHH..HHHH.

    \a input is split on ".." by parseHexList(), which converts each part with
    parseHex(); \a lineNo is forwarded there and used in the error messages
    below. Returns an aggregate {from, to}; for a single value, to == from.
    Callers unpack the result with structured bindings.

    Aborts via qFatal() if the field does not consist of one or two values, or
    if the range is reversed (from > to).
*/
static auto parseHexRange(QByteArrayView input, int lineNo)
{
    struct R { int from, to; };
    const auto pair = parseHexList(input, lineNo, ".."_L1);
    // Checked in all builds: Q_ASSERT() is a no-op when QT_NO_DEBUG is defined,
    // and a field with extra ".." parts would then be read as a single value.
    if (pair.size() < 1 || pair.size() > 2)
        qFatal("invalid range in line %d: expected HHHH[..HHHH]", lineNo);
    const int from = pair[0];
    const int to = pair.size() == 2 ? pair[1] : from;
    if (from > to)
        qFatal("invalid range in line %d: %05x > %05x", lineNo, from, to);
    return R{from, to};
}
static void readUnicodeData()
{
qDebug("Reading UnicodeData.txt");
@ -1582,25 +1598,12 @@ static void readDerivedAge()
{
readUnicodeFile("DerivedAge.txt",
[] (QByteArray &line, int lineNo) {
Q_UNUSED(lineNo);
line.replace(" ", "");
QList<QByteArray> l = line.split(';');
Q_ASSERT(l.size() == 2);
QByteArray codes = l[0];
codes.replace("..", ".");
QList<QByteArray> cl = codes.split('.');
bool ok;
int from = cl[0].toInt(&ok, 16);
Q_ASSERT(ok);
int to = from;
if (cl.size() == 2) {
to = cl[1].toInt(&ok, 16);
Q_ASSERT(ok);
}
const auto [from, to] = parseHexRange(l[0], lineNo);
QChar::UnicodeVersion age = age_map.value(l[1].trimmed(), QChar::Unicode_Unassigned);
//qDebug() << Qt::hex << from << ".." << to << ba << age;
@ -1618,17 +1621,12 @@ static void readEastAsianWidth()
{
readUnicodeFile("EastAsianWidth.txt",
[] (QByteArray &line, int lineNo) {
Q_UNUSED(lineNo);
line = std::move(line).simplified();
QList<QByteArray> fields = line.split(';');
Q_ASSERT(fields.size() == 2);
// That would be split(".."), but that API does not exist.
const QByteArray codePoints = fields[0].trimmed().replace("..", ".");
QList<QByteArray> cl = codePoints.split('.');
Q_ASSERT(cl.size() >= 1 && cl.size() <= 2);
const auto [first, last] = parseHexRange(fields[0], lineNo);
const QByteArray widthString = fields[1].trimmed();
if (!eastAsianWidthMap.contains(widthString)) {
@ -1637,11 +1635,6 @@ static void readEastAsianWidth()
}
auto width = eastAsianWidthMap.value(widthString);
bool ok;
const int first = cl[0].toInt(&ok, 16);
const int last = ok && cl.size() == 2 ? cl[1].toInt(&ok, 16) : first;
Q_ASSERT(ok);
for (int codepoint = first; codepoint <= last; ++codepoint) {
UnicodeData &ud = UnicodeData::valueRef(codepoint);
// Ensure that ranges don't overlap.
@ -1655,8 +1648,6 @@ static void readDerivedNormalizationProps()
{
readUnicodeFile("DerivedNormalizationProps.txt",
[] (const QByteArray &line, int lineNo) {
Q_UNUSED(lineNo);
QList<QByteArray> l = line.split(';');
Q_ASSERT(l.size() >= 2);
@ -1668,18 +1659,7 @@ static void readDerivedNormalizationProps()
return;
}
QByteArray codes = l[0].trimmed();
codes.replace("..", ".");
QList<QByteArray> cl = codes.split('.');
bool ok;
int from = cl[0].toInt(&ok, 16);
Q_ASSERT(ok);
int to = from;
if (cl.size() == 2) {
to = cl[1].toInt(&ok, 16);
Q_ASSERT(ok);
}
const auto [from, to] = parseHexRange(l[0], lineNo);
for (int codepoint = from; codepoint <= to; ++codepoint) {
UnicodeData &d = UnicodeData::valueRef(codepoint);
@ -1797,24 +1777,12 @@ static void readLineBreak()
{
readUnicodeFile("LineBreak.txt",
[] (QByteArray &line, int lineNo) {
Q_UNUSED(lineNo);
line.replace(" ", "");
QList<QByteArray> l = line.split(';');
Q_ASSERT(l.size() == 2);
QByteArray codes = l[0];
codes.replace("..", ".");
QList<QByteArray> cl = codes.split('.');
bool ok;
int from = cl[0].toInt(&ok, 16);
Q_ASSERT(ok);
int to = from;
if (cl.size() == 2) {
to = cl[1].toInt(&ok, 16);
Q_ASSERT(ok);
}
const auto [from, to] = parseHexRange(l[0], lineNo);
LineBreakClass lb = line_break_map.value(l[1], LineBreak_Unassigned);
if (lb == LineBreak_Unassigned)
@ -1920,25 +1888,13 @@ static void readGraphemeBreak()
{
readUnicodeFile("GraphemeBreakProperty.txt",
[] (QByteArray &line, int lineNo) {
Q_UNUSED(lineNo);
line.replace(" ", "");
QList<QByteArray> l = line.split(';');
Q_ASSERT(l.size() == 2);
QByteArray codes = l[0];
codes.replace("..", ".");
QList<QByteArray> cl = codes.split('.');
bool ok;
int from = cl[0].toInt(&ok, 16);
Q_ASSERT(ok);
int to = from;
if (cl.size() == 2) {
to = cl[1].toInt(&ok, 16);
Q_ASSERT(ok);
}
const auto [from, to] = parseHexRange(l[0], lineNo);
GraphemeBreakClass brk = grapheme_break_map.value(l[1], GraphemeBreak_Unassigned);
if (brk == GraphemeBreak_Unassigned)
@ -1955,7 +1911,6 @@ static void readEmojiData()
{
readUnicodeFile("emoji-data.txt",
[] (QByteArray &line, int lineNo) {
Q_UNUSED(lineNo);
line.replace(" ", "");
QList<QByteArray> l = line.split(';');
@ -1965,18 +1920,7 @@ static void readEmojiData()
if (emojiFlags == EmojiFlags::NoEmoji)
return;
QByteArray codes = l[0];
codes.replace("..", ".");
QList<QByteArray> cl = codes.split('.');
bool ok;
int from = cl[0].toInt(&ok, 16);
Q_ASSERT(ok);
int to = from;
if (cl.size() == 2) {
to = cl[1].toInt(&ok, 16);
Q_ASSERT(ok);
}
const auto [from, to] = parseHexRange(l[0], lineNo);
for (int codepoint = from; codepoint <= to; ++codepoint) {
UnicodeData &ud = UnicodeData::valueRef(codepoint);
@ -1996,24 +1940,12 @@ static void readWordBreak()
{
readUnicodeFile("WordBreakProperty.txt",
[] (QByteArray &line, int lineNo) {
Q_UNUSED(lineNo);
line.replace(" ", "");
QList<QByteArray> l = line.split(';');
Q_ASSERT(l.size() == 2);
QByteArray codes = l[0];
codes.replace("..", ".");
QList<QByteArray> cl = codes.split('.');
bool ok;
int from = cl[0].toInt(&ok, 16);
Q_ASSERT(ok);
int to = from;
if (cl.size() == 2) {
to = cl[1].toInt(&ok, 16);
Q_ASSERT(ok);
}
const auto [from, to] = parseHexRange(l[0], lineNo);
WordBreakClass brk = word_break_map.value(l[1], WordBreak_Unassigned);
if (brk == WordBreak_Unassigned)
@ -2039,24 +1971,12 @@ static void readSentenceBreak()
{
readUnicodeFile("SentenceBreakProperty.txt",
[] (QByteArray &line, int lineNo) {
Q_UNUSED(lineNo);
line.replace(" ", "");
QList<QByteArray> l = line.split(';');
Q_ASSERT(l.size() == 2);
QByteArray codes = l[0];
codes.replace("..", ".");
QList<QByteArray> cl = codes.split('.');
bool ok;
int from = cl[0].toInt(&ok, 16);
Q_ASSERT(ok);
int to = from;
if (cl.size() == 2) {
to = cl[1].toInt(&ok, 16);
Q_ASSERT(ok);
}
const auto [from, to] = parseHexRange(l[0], lineNo);
SentenceBreakClass brk = sentence_break_map.value(l[1], SentenceBreak_Unassigned);
if (brk == SentenceBreak_Unassigned)
@ -2263,7 +2183,6 @@ static void readScripts()
{
readUnicodeFile("Scripts.txt",
[] (QByteArray &line, int lineNo) {
Q_UNUSED(lineNo);
line.replace(" ", "");
line.replace("_", "");
@ -2275,17 +2194,7 @@ static void readScripts()
QByteArray codePoints = line.left(semicolon);
QByteArray scriptName = line.mid(semicolon + 1);
codePoints.replace("..", ".");
QList<QByteArray> cl = codePoints.split('.');
bool ok;
int first = cl[0].toInt(&ok, 16);
Q_ASSERT(ok);
int last = first;
if (cl.size() == 2) {
last = cl[1].toInt(&ok, 16);
Q_ASSERT(ok);
}
const auto [first, last] = parseHexRange(codePoints, lineNo);
if (!scriptMap.contains(scriptName))
qFatal("Unhandled script property value: %s", scriptName.constData());
@ -2304,17 +2213,12 @@ static void readIdnaMappingTable()
{
readUnicodeFile("IdnaMappingTable.txt",
[] (QByteArray &line, int lineNo) {
Q_UNUSED(lineNo);
line = std::move(line).simplified();
QList<QByteArray> fields = line.split(';');
Q_ASSERT(fields.size() >= 2);
// That would be split(".."), but that API does not exist.
const QByteArray codePoints = fields[0].trimmed().replace("..", ".");
QList<QByteArray> cl = codePoints.split('.');
Q_ASSERT(cl.size() >= 1 && cl.size() <= 2);
const auto [first, last] = parseHexRange(fields[0], lineNo);
const QByteArray statusString = fields[1].trimmed();
if (!idnaStatusMap.contains(statusString))
@ -2322,11 +2226,6 @@ static void readIdnaMappingTable()
fields[0].constData(), statusString.data());
IdnaRawStatus rawStatus = idnaStatusMap.value(statusString);
bool ok;
const int first = cl[0].toInt(&ok, 16);
const int last = ok && cl.size() == 2 ? cl[1].toInt(&ok, 16) : first;
Q_ASSERT(ok);
QString mapping;
switch (rawStatus) {