Update the Unicode BiDi algorithm to be compliant with Unicode 10

The UBA implementation in Qt was out of date: it implemented the
spec as it stood before Unicode 6.3 and lacked handling of
directional isolates and paired brackets.

This adds a completely new implementation of the UBA, that is
compliant with Unicode 10.

Add the test data from Unicode 10 to the qcomplextext autotest
and ensure that we pass the test suite.

Task-number: QTBUG-57743
Change-Id: Ie2d957bc9775d82f0a51d1c78dc6bd154f22847c
Reviewed-by: Lars Knoll <lars.knoll@qt.io>
This commit is contained in:
Lars Knoll 2018-01-05 12:18:27 +01:00
parent 4804d42ab8
commit 7f504283ef
7 changed files with 595268 additions and 572 deletions

File diff suppressed because it is too large Load Diff

View File

@ -146,9 +146,17 @@ struct Q_AUTOTEST_EXPORT QScriptAnalysis
TabOrObject = Tab,
Object = 7
};
unsigned short script : 7;
unsigned short bidiLevel : 6; // Unicode Bidi algorithm embedding level (0-61)
unsigned short flags : 3;
enum BidiFlags {
BidiBN = 1,
BidiMaybeResetToParagraphLevel = 2,
BidiResetToParagraphLevel = 4,
BidiMirrored = 8
};
unsigned short script : 8;
unsigned short flags : 4;
unsigned short bidiFlags : 4;
unsigned short bidiLevel : 8; // Unicode Bidi algorithm embedding level (0-125)
QChar::Direction bidiDirection : 8; // used when running the bidi algorithm
inline bool operator == (const QScriptAnalysis &other) const {
return script == other.script && bidiLevel == other.bidiLevel && flags == other.flags;
}

View File

@ -106,33 +106,62 @@ const LV logical_visual[] = {
// LRO: \342\200\255
// RLO: \342\200\256
{ "override1", "\342\200\256hello\342\200\254", "\342\200\254olleh\342\200\256", QChar::DirL },
{ "override1", "\342\200\256hello\342\200\254", "\342\200\256olleh\342\200\254", QChar::DirL },
{ "override2", "\342\200\255hello\342\200\254", "\342\200\255hello\342\200\254", QChar::DirL },
{ "override3", "\342\200\255\327\251\327\234\327\225\327\235\342\200\254", "\342\200\255\327\251\327\234\327\225\327\235\342\200\254", QChar::DirL },
{ "override4", "\342\200\256\327\251\327\234\327\225\327\235\342\200\254", "\342\200\254\327\235\327\225\327\234\327\251\342\200\256", QChar::DirL },
{ "override4", "\342\200\256\327\251\327\234\327\225\327\235\342\200\254", "\342\200\256\327\235\327\225\327\234\327\251\342\200\254", QChar::DirL },
{ "override5", "\342\200\256hello\342\200\254", "\342\200\254olleh\342\200\256", QChar::DirR },
{ "override6", "\342\200\255hello\342\200\254", "\342\200\255hello\342\200\254", QChar::DirR },
{ "override7", "\342\200\255\327\251\327\234\327\225\327\235\342\200\254", "\342\200\255\327\251\327\234\327\225\327\235\342\200\254", QChar::DirR },
{ "override6", "\342\200\255hello\342\200\254", "\342\200\254hello\342\200\255", QChar::DirR },
{ "override7", "\342\200\255\327\251\327\234\327\225\327\235\342\200\254", "\342\200\254\327\251\327\234\327\225\327\235\342\200\255", QChar::DirR },
{ "override8", "\342\200\256\327\251\327\234\327\225\327\235\342\200\254", "\342\200\254\327\235\327\225\327\234\327\251\342\200\256", QChar::DirR },
{ "override9", "\327\224\342\200\255\327\251\327\234\342\200\256hello\342\200\254\327\225\327\235\342\200\254", "\342\200\255\327\251\327\234\342\200\254olleh\342\200\256\327\225\327\235\342\200\254\327\224", QChar::DirL },
{ "override10", "\327\224\342\200\255\327\251\327\234\342\200\256hello\342\200\254\327\225\327\235\342\200\254", "\342\200\255\327\251\327\234\342\200\254olleh\342\200\256\327\225\327\235\342\200\254\327\224", QChar::DirR },
{ "override9", "\327\224\342\200\255\327\251\327\234\342\200\256hello\342\200\254\327\225\327\235\342\200\254",
"\327\251\327\234\342\200\256olleh\342\200\254\327\225\327\235\342\200\255\327\224\342\200\254", QChar::DirL },
{ "override10", "\327\224\342\200\255\327\251\327\234\342\200\256hello\342\200\254\327\225\327\235\342\200\254",
"\342\200\254\327\251\327\234\342\200\256olleh\342\200\254\327\225\327\235\342\200\255\327\224", QChar::DirR },
{ "embed1", "\342\200\252hello\342\200\254", "\342\200\252hello\342\200\254", QChar::DirL },
{ "embed2", "\342\200\253hello\342\200\254", "\342\200\254hello\342\200\253", QChar::DirL },
{ "embed3", "\342\200\252hello\342\200\254", "\342\200\252hello\342\200\254", QChar::DirR },
{ "embed2", "\342\200\253hello\342\200\254", "\342\200\253hello\342\200\254", QChar::DirL },
{ "embed3", "\342\200\252hello\342\200\254", "\342\200\254hello\342\200\252", QChar::DirR },
{ "embed4", "\342\200\253hello\342\200\254", "\342\200\254hello\342\200\253", QChar::DirR },
{ "embed5", "\342\200\252\327\251\327\234\327\225\327\235\342\200\254", "\342\200\252\327\235\327\225\327\234\327\251\342\200\254", QChar::DirL },
{ "embed6", "\342\200\253\327\251\327\234\327\225\327\235\342\200\254", "\342\200\254\327\235\327\225\327\234\327\251\342\200\253", QChar::DirL },
{ "embed7", "\342\200\252\327\251\327\234\327\225\327\235\342\200\254", "\342\200\252\327\235\327\225\327\234\327\251\342\200\254", QChar::DirR },
{ "embed6", "\342\200\253\327\251\327\234\327\225\327\235\342\200\254", "\342\200\253\327\235\327\225\327\234\327\251\342\200\254", QChar::DirL },
{ "embed7", "\342\200\252\327\251\327\234\327\225\327\235\342\200\254", "\342\200\254\327\235\327\225\327\234\327\251\342\200\252", QChar::DirR },
{ "embed8", "\342\200\253\327\251\327\234\327\225\327\235\342\200\254", "\342\200\254\327\235\327\225\327\234\327\251\342\200\253", QChar::DirR },
{ "embed9", "\342\200\252x \327\251\327\234\327\225\327\235 y\342\200\254", "\342\200\252x \327\235\327\225\327\234\327\251 y\342\200\254", QChar::DirL },
{ "embed10", "\342\200\253x \327\251\327\234\327\225\327\235 y\342\200\254", "\342\200\254y \327\235\327\225\327\234\327\251 x\342\200\253", QChar::DirL },
{ "embed11", "\342\200\252x \327\251\327\234\327\225\327\235 y\342\200\254", "\342\200\252x \327\235\327\225\327\234\327\251 y\342\200\254", QChar::DirR },
{ "embed10", "\342\200\253x \327\251\327\234\327\225\327\235 y\342\200\254", "\342\200\253y \327\235\327\225\327\234\327\251 x\342\200\254", QChar::DirL },
{ "embed11", "\342\200\252x \327\251\327\234\327\225\327\235 y\342\200\254", "\342\200\254x \327\235\327\225\327\234\327\251 y\342\200\252", QChar::DirR },
{ "embed12", "\342\200\253x \327\251\327\234\327\225\327\235 y\342\200\254", "\342\200\254y \327\235\327\225\327\234\327\251 x\342\200\253", QChar::DirR },
{ "zwsp", "+0\342\200\213f-1", "+0\342\200\213f-1", QChar::DirL },
// Alef: \xD7\x90
{ "bracketpair_1_ltr", "\xD7\x90(\xD7\x90[&a]!)a", "\xD7\x90(\xD7\x90[&a]!)a", QChar::DirL },
{ "bracketpair_1_rtl", "\xD7\x90(\xD7\x90[&a]!)a", "a(![a&]\xD7\x90)\xD7\x90", QChar::DirR },
{ "bracketpair_2_ltr", "a(\xD7\x90[&a]!)\xD7\x90", "a(\xD7\x90[&a]!)\xD7\x90", QChar::DirL },
{ "bracketpair_2_rtl", "a(\xD7\x90[&a]!)\xD7\x90", "\xD7\x90(![a&]\xD7\x90)a", QChar::DirR },
{ "bracketpair_3_ltr", "\xD7\x90(a[&\xD7\x90]!)a", "\xD7\x90(a[&\xD7\x90]!)a", QChar::DirL },
{ "bracketpair_3_rtl", "\xD7\x90(a[&\xD7\x90]!)a", "a(![\xD7\x90&]a)\xD7\x90", QChar::DirR },
{ "bracketpair_4_ltr", "a (a \xD7\x90) \xD7\x90", "a (a \xD7\x90) \xD7\x90", QChar::DirL },
{ "bracketpair_4_rtl", "a (a \xD7\x90) \xD7\x90", "\xD7\x90 (\xD7\x90 a) a", QChar::DirR },
{ "bracketpair_5_ltr", "\xD7\x90 (a \xD7\x90) a", "\xD7\x90 (a \xD7\x90) a", QChar::DirL },
{ "bracketpair_5_rtl", "\xD7\x90 (a \xD7\x90) a", "a (\xD7\x90 a) \xD7\x90", QChar::DirR },
{ "bracketpair_6_ltr", "a (\xD7\x90 a) \xD7\x90", "a (\xD7\x90 a) \xD7\x90", QChar::DirL },
{ "bracketpair_6_rtl", "a (\xD7\x90 a) \xD7\x90", "\xD7\x90 (a \xD7\x90) a", QChar::DirR },
{ "bracketpair_7_ltr", "\xD7\x90\xD7\x90 book(s)", "\xD7\x90\xD7\x90 book(s)", QChar::DirL },
{ "bracketpair_7_rtl", "\xD7\x90\xD7\x90 book(s)", "book(s) \xD7\x90\xD7\x90", QChar::DirR },
{ "bracketpair_8_ltr", "a \xD7\x90\xD7\x90(\xD7\x90)", "a (\xD7\x90)\xD7\x90\xD7\x90", QChar::DirL },
{ "bracketpair_8_rtl", "a \xD7\x90\xD7\x90(\xD7\x90)", "(\xD7\x90)\xD7\x90\xD7\x90 a", QChar::DirR },
{ 0, 0, 0, QChar::DirON }
};

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -3,3 +3,10 @@ TARGET = tst_qcomplextext
QT += testlib
QT += core-private gui-private
SOURCES += tst_qcomplextext.cpp
# Directory containing the bidi test files that the test locates via QFINDTESTDATA("data/...").
TESTDATA += data
# NOTE(review): presumably testdata.qrc bundles the same data directory so the
# files are reachable on Android -- confirm against the .qrc contents.
android {
RESOURCES += \
testdata.qrc
}

View File

@ -47,6 +47,12 @@ private slots:
void bidiCursorLogicalMovement();
void bidiInvalidCursorNoMovement_data();
void bidiInvalidCursorNoMovement();
void bidiCharacterTest_data();
void bidiCharacterTest();
void bidiTest_data();
void bidiTest();
};
void tst_QComplexText::bidiReorderString_data()
@ -93,7 +99,7 @@ void tst_QComplexText::bidiReorderString()
QString sub = logical.mid(si.position, e.length(visualOrder[i]));
if (si.analysis.bidiLevel % 2) {
// reverse sub
QChar *a = (QChar *)sub.unicode();
QChar *a = sub.data();
QChar *b = a + sub.length() - 1;
while (a < b) {
QChar tmp = *a;
@ -273,5 +279,282 @@ void tst_QComplexText::bidiCursor_PDF()
QVERIFY(line.cursorToX(size) == line.cursorToX(size - 1));
}
void tst_QComplexText::bidiCharacterTest_data()
{
QTest::addColumn<QString>("data");
QTest::addColumn<int>("paragraphDirection");
QTest::addColumn<QVector<int>>("resolvedLevels");
QTest::addColumn<QVector<int>>("visualOrder");
QString testFile = QFINDTESTDATA("data/BidiCharacterTest.txt");
QFile f(testFile);
QVERIFY(f.exists());
f.open(QIODevice::ReadOnly);
int linenum = 0;
while (!f.atEnd()) {
linenum++;
QByteArray line = f.readLine().simplified();
if (line.startsWith('#') || line.isEmpty())
continue;
QVERIFY(!line.contains('#'));
QList<QByteArray> parts = line.split(';');
QVERIFY(parts.size() == 5);
QString data;
QList<QByteArray> dataParts = parts.at(0).split(' ');
for (const auto &p : dataParts) {
bool ok;
data += QChar((ushort)p.toInt(&ok, 16));
QVERIFY(ok);
}
int paragraphDirection = parts.at(1).toInt();
// int resolvedParagraphLevel = parts.at(2).toInt();
QVector<int> resolvedLevels;
QList<QByteArray> levelParts = parts.at(3).split(' ');
for (const auto &p : levelParts) {
if (p == "x") {
resolvedLevels += -1;
} else {
bool ok;
resolvedLevels += p.toInt(&ok);
QVERIFY(ok);
}
}
QVector<int> visualOrder;
QList<QByteArray> orderParts = parts.at(4).split(' ');
for (const auto &p : orderParts) {
bool ok;
visualOrder += p.toInt(&ok);
QVERIFY(ok);
}
const QByteArray nm = "line #" + QByteArray::number(linenum);
QTest::newRow(nm.constData()) << data << paragraphDirection << resolvedLevels << visualOrder;
}
}
static void testBidiString(const QString &data, int paragraphDirection, const QVector<int> &resolvedLevels, const QVector<int> &visualOrder)
{
Q_UNUSED(resolvedLevels);
QTextEngine e(data, QFont());
Qt::LayoutDirection pDir = Qt::LeftToRight;
if (paragraphDirection == 1)
pDir = Qt::RightToLeft;
else if (paragraphDirection == 2)
pDir = Qt::LayoutDirectionAuto;
e.option.setTextDirection(pDir);
e.itemize();
quint8 levels[1024];
int visual[1024];
int nitems = e.layoutData->items.size();
int i;
for (i = 0; i < nitems; ++i) {
//qDebug("item %d bidiLevel=%d", i, e.items[i].analysis.bidiLevel);
levels[i] = e.layoutData->items[i].analysis.bidiLevel;
}
e.bidiReorder(nitems, levels, visual);
QString visualString;
for (i = 0; i < nitems; ++i) {
QScriptItem &si = e.layoutData->items[visual[i]];
QString sub;
for (int j = si.position; j < si.position + e.length(visual[i]); ++j) {
switch (data.at(j).direction()) {
case QChar::DirLRE:
case QChar::DirRLE:
case QChar::DirLRO:
case QChar::DirRLO:
case QChar::DirPDF:
case QChar::DirBN:
continue;
default:
break;
}
sub += data.at(j);
}
// remove explicit embedding characters, as the test data has them removed as well
sub.remove(QChar(0x202a));
sub.remove(QChar(0x202b));
sub.remove(QChar(0x202c));
sub.remove(QChar(0x202d));
sub.remove(QChar(0x202e));
if (si.analysis.bidiLevel % 2) {
// reverse sub
QChar *a = sub.data();
QChar *b = a + sub.length() - 1;
while (a < b) {
QChar tmp = *a;
*a = *b;
*b = tmp;
++a;
--b;
}
a = (QChar *)sub.unicode();
b = a + sub.length();
// while (a<b) {
// *a = a->mirroredChar();
// ++a;
// }
}
visualString += sub;
}
QString expected;
// qDebug() << "expected visual order";
for (int i : visualOrder) {
// qDebug() << " " << i << hex << data[i].unicode();
expected.append(data[i]);
}
QCOMPARE(visualString, expected);
}
// Runs one row produced by bidiCharacterTest_data() through testBidiString().
void tst_QComplexText::bidiCharacterTest()
{
QFETCH(QString, data);
QFETCH(int, paragraphDirection);
QFETCH(QVector<int>, resolvedLevels);
QFETCH(QVector<int>, visualOrder);
// All verification happens in the shared helper.
testBidiString(data, paragraphDirection, resolvedLevels, visualOrder);
}
// Translates a bidi class name (for example "L", "AN" or "RLI") into the
// UTF-16 code unit that the table below associates with it.
//
// The name must be one of the entries in the table; anything else ends up
// in Q_UNREACHABLE().
ushort unicodeForDirection(const QByteArray &direction)
{
    struct DirectionSample {
        const char *name;
        ushort codeUnit;
    };
    static const DirectionSample samples[] = {
        { "L", 0x41 },
        { "R", 0x5d0 },
        { "EN", 0x30 },
        { "ES", 0x2b },
        { "ET", 0x24 },
        { "AN", 0x660 },
        { "CS", 0x2c },
        { "B", QChar::ParagraphSeparator },
        { "S", 0x9 },
        { "WS", 0x20 },
        { "ON", 0x2a },
        { "LRE", 0x202a },
        { "LRO", 0x202d },
        { "AL", 0x627 },
        { "RLE", 0x202b },
        { "RLO", 0x202e },
        { "PDF", 0x202c },
        { "NSM", 0x300 },
        { "BN", 0xad },
        { "LRI", 0x2066 },
        { "RLI", 0x2067 },
        { "FSI", 0x2068 },
        { "PDI", 0x2069 }
    };

    // Linear scan; the table is tiny.
    const int sampleCount = int(sizeof(samples) / sizeof(samples[0]));
    for (int idx = 0; idx < sampleCount; ++idx) {
        const DirectionSample &candidate = samples[idx];
        if (candidate.name == direction)
            return candidate.codeUnit;
    }

    Q_UNREACHABLE();
}
void tst_QComplexText::bidiTest_data()
{
QTest::addColumn<QString>("data");
QTest::addColumn<int>("paragraphDirection");
QTest::addColumn<QVector<int>>("resolvedLevels");
QTest::addColumn<QVector<int>>("visualOrder");
QString testFile = QFINDTESTDATA("data/BidiTest.txt");
QFile f(testFile);
QVERIFY(f.exists());
f.open(QIODevice::ReadOnly);
int linenum = 0;
QVector<int> resolvedLevels;
QVector<int> visualOrder;
while (!f.atEnd()) {
linenum++;
QByteArray line = f.readLine().simplified();
if (line.startsWith('#') || line.isEmpty())
continue;
QVERIFY(!line.contains('#'));
if (line.startsWith("@Levels:")) {
line = line.mid(strlen("@Levels:")).simplified();
resolvedLevels.clear();
QList<QByteArray> levelParts = line.split(' ');
for (const auto &p : levelParts) {
if (p == "x") {
resolvedLevels += -1;
} else {
bool ok;
resolvedLevels += p.toInt(&ok);
QVERIFY(ok);
}
}
continue;
} else if (line.startsWith("@Reorder:")) {
line = line.mid(strlen("@Reorder:")).simplified();
visualOrder.clear();
QList<QByteArray> orderParts = line.split(' ');
for (const auto &p : orderParts) {
if (p.isEmpty())
continue;
bool ok;
visualOrder += p.toInt(&ok);
QVERIFY(ok);
}
continue;
}
QList<QByteArray> parts = line.split(';');
Q_ASSERT(parts.size() == 2);
QString data;
QList<QByteArray> dataParts = parts.at(0).split(' ');
for (const auto &p : dataParts) {
ushort uc = unicodeForDirection(p);
data += QChar(uc);
}
int paragraphDirections = parts.at(1).toInt();
const QByteArray nm = "line #" + QByteArray::number(linenum);
if (paragraphDirections & 1)
QTest::newRow((nm + " (Auto)").constData()) << data << 2 << resolvedLevels << visualOrder;
if (paragraphDirections & 2)
QTest::newRow((nm + " (LTR)").constData()) << data << 0 << resolvedLevels << visualOrder;
if (paragraphDirections & 4)
QTest::newRow((nm + " (RTL)").constData()) << data << 1 << resolvedLevels << visualOrder;
}
}
// Runs one row produced by bidiTest_data() through testBidiString().
void tst_QComplexText::bidiTest()
{
QFETCH(QString, data);
QFETCH(int, paragraphDirection);
QFETCH(QVector<int>, resolvedLevels);
QFETCH(QVector<int>, visualOrder);
// All verification happens in the shared helper.
testBidiString(data, paragraphDirection, resolvedLevels, visualOrder);
}
QTEST_MAIN(tst_QComplexText)
#include "tst_qcomplextext.moc"