util/unicode: Extract Method parseHexRange()

Wrapping parseHexList(), which gets extended to support QLatin1StringView separators, add parseHexRange() and use it around the code to parse HHHHH[..HHHHH] hex ranges. Amends the start of the public history. Pick-to: 6.10 6.9 6.8 6.5 Change-Id: I0372e5c239642988f0e920d95108657e276b19dd Reviewed-by: Ahmad Samir <a.samirh78@gmail.com>
2025-08-27 12:09:12 +02:00 · 2025-08-27 12:09:12 +02:00 · 6e526bf92c
parent 714969e8a1
commit 6e526bf92c
1 changed files with 29 additions and 130 deletions
--- a/util/unicode/main.cpp
+++ b/util/unicode/main.cpp
@ -1348,16 +1348,32 @@ static int parseHex(QByteArrayView input, int lineNo)
    return result;
 }

-QVarLengthArray<int, 4> parseHexList(QByteArrayView input, int lineNo)
+// Splits \a input (viewed as Latin-1) on \a sep and converts every resulting
+// token with parseHex(), returning the values in input order.
+//
+// \a Sep is the separator type handed to qTokenize(); it defaults to a
+// single char16_t space. \a lineNo is forwarded unchanged to parseHex().
+template <typename Sep = char16_t>
+QVarLengthArray<int, 4> parseHexList(QByteArrayView input, int lineNo, Sep sep = u' ')
 {
    QVarLengthArray<int, 4> result;
-    constexpr char16_t sep = u' ';
-    constexpr auto sb = Qt::SkipEmptyParts;
+    // With the space separator, empty parts are skipped, so runs of spaces do
+    // not produce tokens. With any other separator, empty parts are kept and
+    // therefore reach parseHex() like any other token.
+    const auto sb = sep == u' ' ? Qt::SkipEmptyParts : Qt::KeepEmptyParts;
    for (auto e : qTokenize(QLatin1StringView{input}, sep, sb))
        result.push_back(parseHex(e, lineNo));
    return result;
 }

+// Parses a code point range written as HHHH or HHHH..HHHH and returns it as
+// an inclusive {from, to} pair; a single code point yields from == to.
+//
+// \a lineNo is forwarded to parseHexList() and used in the diagnostics.
+// Aborts via qFatal() if the field does not consist of exactly one or two
+// ".."-separated tokens, or if the range is reversed (from > to).
+static auto parseHexRange(QByteArrayView input, int lineNo)
+{
+    struct R { int from, to; };
+
+    const auto pair = parseHexList(input, lineNo, ".."_L1);
+    // Checked with qFatal() rather than Q_ASSERT() so that a malformed range
+    // (no token at all, or more than one "..") is also rejected when
+    // assertions are compiled out, instead of being silently reduced to its
+    // first code point.
+    if (pair.isEmpty() || pair.size() > 2) {
+        qFatal("invalid range in line %d: expected HHHH[..HHHH], got '%.*s'",
+               lineNo, int(input.size()), input.data());
+    }
+    const int from = pair[0];
+    const int to = pair.size() == 2 ? pair[1] : from;
+    if (from > to)
+        qFatal("invalid range in line %d: %05x > %05x", lineNo, from, to);
+    return R{from, to};
+}
+
 static void readUnicodeData()
 {
    qDebug("Reading UnicodeData.txt");
@ -1582,25 +1598,12 @@ static void readDerivedAge()
 {
    readUnicodeFile("DerivedAge.txt",
                    [] (QByteArray &line, int lineNo) {
-        Q_UNUSED(lineNo);
-
        line.replace(" ", "");

        QList<QByteArray> l = line.split(';');
        Q_ASSERT(l.size() == 2);

-        QByteArray codes = l[0];
-        codes.replace("..", ".");
-        QList<QByteArray> cl = codes.split('.');
-
-        bool ok;
-        int from = cl[0].toInt(&ok, 16);
-        Q_ASSERT(ok);
-        int to = from;
-        if (cl.size() == 2) {
-            to = cl[1].toInt(&ok, 16);
-            Q_ASSERT(ok);
-        }
+        const auto [from, to] = parseHexRange(l[0], lineNo);

        QChar::UnicodeVersion age = age_map.value(l[1].trimmed(), QChar::Unicode_Unassigned);
        //qDebug() << Qt::hex << from << ".." << to << ba << age;
@ -1618,17 +1621,12 @@ static void readEastAsianWidth()
 {
    readUnicodeFile("EastAsianWidth.txt",
                    [] (QByteArray &line, int lineNo) {
-        Q_UNUSED(lineNo);
-
        line = std::move(line).simplified();

        QList<QByteArray> fields = line.split(';');
        Q_ASSERT(fields.size() == 2);

-        // That would be split(".."), but that API does not exist.
-        const QByteArray codePoints = fields[0].trimmed().replace("..", ".");
-        QList<QByteArray> cl = codePoints.split('.');
-        Q_ASSERT(cl.size() >= 1 && cl.size() <= 2);
+        const auto [first, last] = parseHexRange(fields[0], lineNo);

        const QByteArray widthString = fields[1].trimmed();
        if (!eastAsianWidthMap.contains(widthString)) {
@ -1637,11 +1635,6 @@ static void readEastAsianWidth()
        }
        auto width = eastAsianWidthMap.value(widthString);

-        bool ok;
-        const int first = cl[0].toInt(&ok, 16);
-        const int last = ok && cl.size() == 2 ? cl[1].toInt(&ok, 16) : first;
-        Q_ASSERT(ok);
-
        for (int codepoint = first; codepoint <= last; ++codepoint) {
            UnicodeData &ud = UnicodeData::valueRef(codepoint);
            // Ensure that ranges don't overlap.
@ -1655,8 +1648,6 @@ static void readDerivedNormalizationProps()
 {
    readUnicodeFile("DerivedNormalizationProps.txt",
                    [] (const QByteArray &line, int lineNo) {
-        Q_UNUSED(lineNo);
-
        QList<QByteArray> l = line.split(';');
        Q_ASSERT(l.size() >= 2);

@ -1668,18 +1659,7 @@ static void readDerivedNormalizationProps()
            return;
        }

-        QByteArray codes = l[0].trimmed();
-        codes.replace("..", ".");
-        QList<QByteArray> cl = codes.split('.');
-
-        bool ok;
-        int from = cl[0].toInt(&ok, 16);
-        Q_ASSERT(ok);
-        int to = from;
-        if (cl.size() == 2) {
-            to = cl[1].toInt(&ok, 16);
-            Q_ASSERT(ok);
-        }
+        const auto [from, to] = parseHexRange(l[0], lineNo);

        for (int codepoint = from; codepoint <= to; ++codepoint) {
            UnicodeData &d = UnicodeData::valueRef(codepoint);
@ -1797,24 +1777,12 @@ static void readLineBreak()
 {
    readUnicodeFile("LineBreak.txt",
                    [] (QByteArray &line, int lineNo) {
-        Q_UNUSED(lineNo);
        line.replace(" ", "");

        QList<QByteArray> l = line.split(';');
        Q_ASSERT(l.size() == 2);

-        QByteArray codes = l[0];
-        codes.replace("..", ".");
-        QList<QByteArray> cl = codes.split('.');
-
-        bool ok;
-        int from = cl[0].toInt(&ok, 16);
-        Q_ASSERT(ok);
-        int to = from;
-        if (cl.size() == 2) {
-            to = cl[1].toInt(&ok, 16);
-            Q_ASSERT(ok);
-        }
+        const auto [from, to] = parseHexRange(l[0], lineNo);

        LineBreakClass lb = line_break_map.value(l[1], LineBreak_Unassigned);
        if (lb == LineBreak_Unassigned)
@ -1920,25 +1888,13 @@ static void readGraphemeBreak()
 {
    readUnicodeFile("GraphemeBreakProperty.txt",
                    [] (QByteArray &line, int lineNo) {
-        Q_UNUSED(lineNo);

        line.replace(" ", "");

        QList<QByteArray> l = line.split(';');
        Q_ASSERT(l.size() == 2);

-        QByteArray codes = l[0];
-        codes.replace("..", ".");
-        QList<QByteArray> cl = codes.split('.');
-
-        bool ok;
-        int from = cl[0].toInt(&ok, 16);
-        Q_ASSERT(ok);
-        int to = from;
-        if (cl.size() == 2) {
-            to = cl[1].toInt(&ok, 16);
-            Q_ASSERT(ok);
-        }
+        const auto [from, to] = parseHexRange(l[0], lineNo);

        GraphemeBreakClass brk = grapheme_break_map.value(l[1], GraphemeBreak_Unassigned);
        if (brk == GraphemeBreak_Unassigned)
@ -1955,7 +1911,6 @@ static void readEmojiData()
 {
    readUnicodeFile("emoji-data.txt",
                    [] (QByteArray &line, int lineNo) {
-        Q_UNUSED(lineNo);
        line.replace(" ", "");

        QList<QByteArray> l = line.split(';');
@ -1965,18 +1920,7 @@ static void readEmojiData()
        if (emojiFlags == EmojiFlags::NoEmoji)
            return;

-        QByteArray codes = l[0];
-        codes.replace("..", ".");
-        QList<QByteArray> cl = codes.split('.');
-
-        bool ok;
-        int from = cl[0].toInt(&ok, 16);
-        Q_ASSERT(ok);
-        int to = from;
-        if (cl.size() == 2) {
-            to = cl[1].toInt(&ok, 16);
-            Q_ASSERT(ok);
-        }
+        const auto [from, to] = parseHexRange(l[0], lineNo);

        for (int codepoint = from; codepoint <= to; ++codepoint) {
            UnicodeData &ud = UnicodeData::valueRef(codepoint);
@ -1996,24 +1940,12 @@ static void readWordBreak()
 {
    readUnicodeFile("WordBreakProperty.txt",
                    [] (QByteArray &line, int lineNo) {
-        Q_UNUSED(lineNo);
        line.replace(" ", "");

        QList<QByteArray> l = line.split(';');
        Q_ASSERT(l.size() == 2);

-        QByteArray codes = l[0];
-        codes.replace("..", ".");
-        QList<QByteArray> cl = codes.split('.');
-
-        bool ok;
-        int from = cl[0].toInt(&ok, 16);
-        Q_ASSERT(ok);
-        int to = from;
-        if (cl.size() == 2) {
-            to = cl[1].toInt(&ok, 16);
-            Q_ASSERT(ok);
-        }
+        const auto [from, to] = parseHexRange(l[0], lineNo);

        WordBreakClass brk = word_break_map.value(l[1], WordBreak_Unassigned);
        if (brk == WordBreak_Unassigned)
@ -2039,24 +1971,12 @@ static void readSentenceBreak()
 {
    readUnicodeFile("SentenceBreakProperty.txt",
                    [] (QByteArray &line, int lineNo) {
-        Q_UNUSED(lineNo);
        line.replace(" ", "");

        QList<QByteArray> l = line.split(';');
        Q_ASSERT(l.size() == 2);

-        QByteArray codes = l[0];
-        codes.replace("..", ".");
-        QList<QByteArray> cl = codes.split('.');
-
-        bool ok;
-        int from = cl[0].toInt(&ok, 16);
-        Q_ASSERT(ok);
-        int to = from;
-        if (cl.size() == 2) {
-            to = cl[1].toInt(&ok, 16);
-            Q_ASSERT(ok);
-        }
+        const auto [from, to] = parseHexRange(l[0], lineNo);

        SentenceBreakClass brk = sentence_break_map.value(l[1], SentenceBreak_Unassigned);
        if (brk == SentenceBreak_Unassigned)
@ -2263,7 +2183,6 @@ static void readScripts()
 {
    readUnicodeFile("Scripts.txt",
                    [] (QByteArray &line, int lineNo) {
-        Q_UNUSED(lineNo);
        line.replace(" ", "");
        line.replace("_", "");

@ -2275,17 +2194,7 @@ static void readScripts()
        QByteArray codePoints = line.left(semicolon);
        QByteArray scriptName = line.mid(semicolon + 1);

-        codePoints.replace("..", ".");
-        QList<QByteArray> cl = codePoints.split('.');
-
-        bool ok;
-        int first = cl[0].toInt(&ok, 16);
-        Q_ASSERT(ok);
-        int last = first;
-        if (cl.size() == 2) {
-            last = cl[1].toInt(&ok, 16);
-            Q_ASSERT(ok);
-        }
+        const auto [first, last] = parseHexRange(codePoints, lineNo);

        if (!scriptMap.contains(scriptName))
            qFatal("Unhandled script property value: %s", scriptName.constData());
@ -2304,17 +2213,12 @@ static void readIdnaMappingTable()
 {
    readUnicodeFile("IdnaMappingTable.txt",
                    [] (QByteArray &line, int lineNo) {
-        Q_UNUSED(lineNo);
-
        line = std::move(line).simplified();

        QList<QByteArray> fields = line.split(';');
        Q_ASSERT(fields.size() >= 2);

-        // That would be split(".."), but that API does not exist.
-        const QByteArray codePoints = fields[0].trimmed().replace("..", ".");
-        QList<QByteArray> cl = codePoints.split('.');
-        Q_ASSERT(cl.size() >= 1 && cl.size() <= 2);
+        const auto [first, last] = parseHexRange(fields[0], lineNo);

        const QByteArray statusString = fields[1].trimmed();
        if (!idnaStatusMap.contains(statusString))
@ -2322,11 +2226,6 @@ static void readIdnaMappingTable()
                   fields[0].constData(), statusString.data());
        IdnaRawStatus rawStatus = idnaStatusMap.value(statusString);

-        bool ok;
-        const int first = cl[0].toInt(&ok, 16);
-        const int last = ok && cl.size() == 2 ? cl[1].toInt(&ok, 16) : first;
-        Q_ASSERT(ok);
-
        QString mapping;

        switch (rawStatus) {