2022-05-10 10:06:48 +00:00
|
|
|
// Copyright (C) 2016 The Qt Company Ltd.
|
2024-02-02 13:36:10 +00:00
|
|
|
// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR GPL-3.0-only
|
2011-04-27 10:05:43 +00:00
|
|
|
|
2021-08-18 09:48:29 +00:00
|
|
|
#include "tst_bench_qhash.h"
|
2011-04-27 10:05:43 +00:00
|
|
|
|
2012-02-28 19:56:14 +00:00
|
|
|
QT_BEGIN_NAMESPACE
|
|
|
|
|
qHash: implement an AES hasher for QLatin1StringView
It's the same aeshash() as before, except we're passing a template
parameter to indicate whether to read half and then zero-extend the
data. That is, it will perform a conversion from Latin1 on the fly.
When running in zero-extending mode, the length parameters are actually
doubled (counting the number of UTF-16 code units) and we then divide
again by 2 when advancing.
The implementation should have the following performance
characteristics:
* QLatin1StringView now will be roughly half as fast as Qt 6.7
* QLatin1StringView now will be roughly as fast as QStringView
For the aeshash128() in default builds of QtCore (will use SSE4.1), the
long loop (32 characters or more) is:
QStringView QLatin1StringView
movdqu -0x20(%rax),%xmm4 | pmovzxbw -0x10(%rdx),%xmm2
movdqu -0x10(%rax),%xmm5 | pmovzxbw -0x8(%rdx),%xmm3
add $0x20,%rax | add $0x10,%rdx
pxor %xmm4,%xmm0 | pxor %xmm2,%xmm0
pxor %xmm5,%xmm1 | pxor %xmm3,%xmm1
aesenc %xmm0,%xmm0 aesenc %xmm0,%xmm0
aesenc %xmm1,%xmm1 aesenc %xmm1,%xmm1
aesenc %xmm0,%xmm0 aesenc %xmm0,%xmm0
aesenc %xmm1,%xmm1 aesenc %xmm1,%xmm1
The number of instructions is identical, but there are actually 2 more
uops per iteration. LLVM-MCA simulation shows this should execute in the
same number of cycles on older CPUs that do not have support for VAES
(see <https://analysis.godbolt.org/z/x95Mrfrf7>).
For the VAES version in aeshash256() and the AVX10 version in
aeshash256_256():
QStringView QLatin1StringView
vpxor -0x40(%rax),%ymm1,%ym | vpmovzxbw -0x20(%rax),%ymm3
vpxor -0x20(%rax),%ymm0,%ym | vpmovzxbw -0x10(%rax),%ymm2
add $0x40,%rax | add $0x20,%rax
| vpxor %ymm3,%ymm0,%ymm0
| vpxor %ymm2,%ymm1,%ymm1
vaesenc %ymm1,%ymm1,%ymm1 <
vaesenc %ymm0,%ymm0,%ymm0 vaesenc %ymm0,%ymm0,%ymm0
vaesenc %ymm1,%ymm1,%ymm1 vaesenc %ymm1,%ymm1,%ymm1
vaesenc %ymm0,%ymm0,%ymm0 vaesenc %ymm0,%ymm0,%ymm0
> vaesenc %ymm1,%ymm1,%ymm1
In this case, the increase in number of instructions matches the
increase in number of uops. The LLVM-MCA simulation says that the
QLatin1StringView version is faster at 11 cycles/iteration vs 14 cyc/it
(see <https://analysis.godbolt.org/z/1Gv1coz13>), but that can't be
right.
Measured performance of CPU cycles, on an Intel Core i9-7940X (Skylake,
no VAES support), normalized on the QString performance (QByteArray is
used as a stand-in for the performance in Qt 6.7):
aeshash | siphash
QByteArray QL1SV QString QByteArray QString
dictionary 94.5% 79.7% 100.0% 150.5%* 159.8%
paths-small 90.2% 93.2% 100.0% 202.8% 290.3%
uuids 81.8% 100.7% 100.0% 215.2% 350.7%
longstrings 42.5% 100.8% 100.0% 185.7% 353.2%
numbers 95.5% 77.9% 100.0% 155.3%* 164.5%
On an Intel Core i7-1165G7 (Tiger Lake, capable of VAES and AVX512VL):
aeshash | siphash
QByteArray QL1SV QString QByteArray QString
dictionary 90.0% 91.1% 100.0% 103.3%* 157.1%
paths-small 99.4% 104.8% 100.0% 237.5% 358.0%
uuids 88.5% 117.6% 100.0% 274.5% 461.7%
longstrings 57.4% 111.2% 100.0% 503.0% 974.3%
numbers 90.6% 89.7% 100.0% 98.7%* 149.9%
On an Intel 4th Generation Xeon Scalable Platinum (Sapphire Rapids, same
Golden Cove core as Alder Lake):
aeshash | siphash
QByteArray QL1SV QString QByteArray QString
dictionary 89.9% 102.1% 100.0% 158.1%* 172.7%
paths-small 78.0% 89.4% 100.0% 159.4% 258.0%
uuids 109.1% 107.9% 100.0% 279.0% 496.3%
longstrings 52.1% 112.4% 100.0% 564.4% 1078.3%
numbers 85.8% 98.9% 100.0% 152.6%* 190.4%
* dictionary contains very short entries (6 characters)
* paths-small contains strings of varying length, but very few over 32
* uuids-list contains fixed-length strings (38 characters)
* longstrings is the same but 304 characters
* numbers also a lot contains very short strings (1 to 6 chars)
What this shows:
* For short strings, the performance difference is negligible between
all three
* For longer strings, QLatin1StringView now costs between 7 and 17% more
than QString on the tested machines instead of up to ~50% less, except on
the older machine (where I think the main QString hashing is suffering
from memory bandwidth limitations)
* The AES hash implementation is anywhere from 1.6 to 11x faster than
Siphash
* Murmurhash (marked with asterisk) is much faster than Siphash, but it
only managed to beat the AES hash in one test
Change-Id: I664b9f014ffc48cbb49bfffd17b045c1811ac0ed
Reviewed-by: Qt CI Bot <qt_ci_bot@qt-project.org>
Reviewed-by: Mårten Nordheim <marten.nordheim@qt.io>
2024-02-03 06:15:56 +00:00
|
|
|
size_t qHash(const Qt4String &str, size_t /* never used */)
|
2012-02-28 19:56:14 +00:00
|
|
|
{
|
Port from container::count() and length() to size() - V5
This is a semantic patch using ClangTidyTransformator as in
qtbase/df9d882d41b741fef7c5beeddb0abe9d904443d8, but extended to
handle typedefs and accesses through pointers, too:
const std::string o = "object";
auto hasTypeIgnoringPointer = [](auto type) { return anyOf(hasType(type), hasType(pointsTo(type))); };
auto derivedFromAnyOfClasses = [&](ArrayRef<StringRef> classes) {
auto exprOfDeclaredType = [&](auto decl) {
return expr(hasTypeIgnoringPointer(hasUnqualifiedDesugaredType(recordType(hasDeclaration(decl))))).bind(o);
};
return exprOfDeclaredType(cxxRecordDecl(isSameOrDerivedFrom(hasAnyName(classes))));
};
auto renameMethod = [&] (ArrayRef<StringRef> classes,
StringRef from, StringRef to) {
return makeRule(cxxMemberCallExpr(on(derivedFromAnyOfClasses(classes)),
callee(cxxMethodDecl(hasName(from), parameterCountIs(0)))),
changeTo(cat(access(o, cat(to)), "()")),
cat("use '", to, "' instead of '", from, "'"));
};
renameMethod(<classes>, "count", "size");
renameMethod(<classes>, "length", "size");
except that the on() matcher has been replaced by one that doesn't
ignoreParens().
a.k.a qt-port-to-std-compatible-api V5 with config Scope: 'Container'.
Added two NOLINTNEXTLINEs in tst_qbitarray and tst_qcontiguouscache,
to avoid porting calls that explicitly test count().
Change-Id: Icfb8808c2ff4a30187e9935a51cad26987451c22
Reviewed-by: Ivan Solovev <ivan.solovev@qt.io>
Reviewed-by: Qt CI Bot <qt_ci_bot@qt-project.org>
2022-09-30 12:09:04 +00:00
|
|
|
qsizetype n = str.size();
|
2012-02-28 19:56:14 +00:00
|
|
|
const QChar *p = str.unicode();
|
|
|
|
uint h = 0;
|
|
|
|
|
|
|
|
while (n--) {
|
|
|
|
h = (h << 4) + (*p++).unicode();
|
|
|
|
h ^= (h & 0xf0000000) >> 23;
|
|
|
|
h &= 0x0fffffff;
|
|
|
|
}
|
|
|
|
return h;
|
|
|
|
}
|
|
|
|
|
2022-09-24 11:54:42 +00:00
|
|
|
size_t qHash(const Qt50String &key, size_t seed)
|
2013-12-20 07:32:04 +00:00
|
|
|
{
|
|
|
|
const QChar *p = key.unicode();
|
2022-09-24 11:54:42 +00:00
|
|
|
qsizetype len = key.size();
|
|
|
|
size_t h = seed;
|
2013-12-20 07:32:04 +00:00
|
|
|
for (int i = 0; i < len; ++i)
|
|
|
|
h = 31 * h + p[i].unicode();
|
|
|
|
return h;
|
|
|
|
}
|
|
|
|
|
2012-01-25 22:12:43 +00:00
|
|
|
// The Java's hashing algorithm for strings is a variation of D. J. Bernstein
|
|
|
|
// hashing algorithm appeared here http://cr.yp.to/cdb/cdb.txt
|
|
|
|
// and informally known as DJB33XX - DJB's 33 Times Xor.
|
|
|
|
// Java uses DJB31XA, that is, 31 Times Add.
|
|
|
|
// The original algorithm was a loop around "(h << 5) + h ^ c",
|
|
|
|
// which is indeed "h * 33 ^ c"; it was then changed to
|
|
|
|
// "(h << 5) - h ^ c", so "h * 31 ^ c", and the XOR changed to a sum:
|
|
|
|
// "(h << 5) - h + c", which can save some assembly instructions.
|
|
|
|
// Still, we can avoid writing the multiplication as "(h << 5) - h"
|
|
|
|
// -- the compiler will turn it into a shift and an addition anyway
|
|
|
|
// (for instance, gcc 4.4 does that even at -O0).
|
qHash: implement an AES hasher for QLatin1StringView
It's the same aeshash() as before, except we're passing a template
parameter to indicate whether to read half and then zero-extend the
data. That is, it will perform a conversion from Latin1 on the fly.
When running in zero-extending mode, the length parameters are actually
doubled (counting the number of UTF-16 code units) and we then divide
again by 2 when advancing.
The implementation should have the following performance
characteristics:
* QLatin1StringView now will be roughly half as fast as Qt 6.7
* QLatin1StringView now will be roughly as fast as QStringView
For the aeshash128() in default builds of QtCore (will use SSE4.1), the
long loop (32 characters or more) is:
QStringView QLatin1StringView
movdqu -0x20(%rax),%xmm4 | pmovzxbw -0x10(%rdx),%xmm2
movdqu -0x10(%rax),%xmm5 | pmovzxbw -0x8(%rdx),%xmm3
add $0x20,%rax | add $0x10,%rdx
pxor %xmm4,%xmm0 | pxor %xmm2,%xmm0
pxor %xmm5,%xmm1 | pxor %xmm3,%xmm1
aesenc %xmm0,%xmm0 aesenc %xmm0,%xmm0
aesenc %xmm1,%xmm1 aesenc %xmm1,%xmm1
aesenc %xmm0,%xmm0 aesenc %xmm0,%xmm0
aesenc %xmm1,%xmm1 aesenc %xmm1,%xmm1
The number of instructions is identical, but there are actually 2 more
uops per iteration. LLVM-MCA simulation shows this should execute in the
same number of cycles on older CPUs that do not have support for VAES
(see <https://analysis.godbolt.org/z/x95Mrfrf7>).
For the VAES version in aeshash256() and the AVX10 version in
aeshash256_256():
QStringView QLatin1StringView
vpxor -0x40(%rax),%ymm1,%ym | vpmovzxbw -0x20(%rax),%ymm3
vpxor -0x20(%rax),%ymm0,%ym | vpmovzxbw -0x10(%rax),%ymm2
add $0x40,%rax | add $0x20,%rax
| vpxor %ymm3,%ymm0,%ymm0
| vpxor %ymm2,%ymm1,%ymm1
vaesenc %ymm1,%ymm1,%ymm1 <
vaesenc %ymm0,%ymm0,%ymm0 vaesenc %ymm0,%ymm0,%ymm0
vaesenc %ymm1,%ymm1,%ymm1 vaesenc %ymm1,%ymm1,%ymm1
vaesenc %ymm0,%ymm0,%ymm0 vaesenc %ymm0,%ymm0,%ymm0
> vaesenc %ymm1,%ymm1,%ymm1
In this case, the increase in number of instructions matches the
increase in number of uops. The LLVM-MCA simulation says that the
QLatin1StringView version is faster at 11 cycles/iteration vs 14 cyc/it
(see <https://analysis.godbolt.org/z/1Gv1coz13>), but that can't be
right.
Measured performance of CPU cycles, on an Intel Core i9-7940X (Skylake,
no VAES support), normalized on the QString performance (QByteArray is
used as a stand-in for the performance in Qt 6.7):
aeshash | siphash
QByteArray QL1SV QString QByteArray QString
dictionary 94.5% 79.7% 100.0% 150.5%* 159.8%
paths-small 90.2% 93.2% 100.0% 202.8% 290.3%
uuids 81.8% 100.7% 100.0% 215.2% 350.7%
longstrings 42.5% 100.8% 100.0% 185.7% 353.2%
numbers 95.5% 77.9% 100.0% 155.3%* 164.5%
On an Intel Core i7-1165G7 (Tiger Lake, capable of VAES and AVX512VL):
aeshash | siphash
QByteArray QL1SV QString QByteArray QString
dictionary 90.0% 91.1% 100.0% 103.3%* 157.1%
paths-small 99.4% 104.8% 100.0% 237.5% 358.0%
uuids 88.5% 117.6% 100.0% 274.5% 461.7%
longstrings 57.4% 111.2% 100.0% 503.0% 974.3%
numbers 90.6% 89.7% 100.0% 98.7%* 149.9%
On an Intel 4th Generation Xeon Scalable Platinum (Sapphire Rapids, same
Golden Cove core as Alder Lake):
aeshash | siphash
QByteArray QL1SV QString QByteArray QString
dictionary 89.9% 102.1% 100.0% 158.1%* 172.7%
paths-small 78.0% 89.4% 100.0% 159.4% 258.0%
uuids 109.1% 107.9% 100.0% 279.0% 496.3%
longstrings 52.1% 112.4% 100.0% 564.4% 1078.3%
numbers 85.8% 98.9% 100.0% 152.6%* 190.4%
* dictionary contains very short entries (6 characters)
* paths-small contains strings of varying length, but very few over 32
* uuids-list contains fixed-length strings (38 characters)
* longstrings is the same but 304 characters
* numbers also a lot contains very short strings (1 to 6 chars)
What this shows:
* For short strings, the performance difference is negligible between
all three
* For longer strings, QLatin1StringView now costs between 7 and 17% more
than QString on the tested machines instead of up to ~50% less, except on
the older machine (where I think the main QString hashing is suffering
from memory bandwidth limitations)
* The AES hash implementation is anywhere from 1.6 to 11x faster than
Siphash
* Murmurhash (marked with asterisk) is much faster than Siphash, but it
only managed to beat the AES hash in one test
Change-Id: I664b9f014ffc48cbb49bfffd17b045c1811ac0ed
Reviewed-by: Qt CI Bot <qt_ci_bot@qt-project.org>
Reviewed-by: Mårten Nordheim <marten.nordheim@qt.io>
2024-02-03 06:15:56 +00:00
|
|
|
size_t qHash(const JavaString &str, size_t /* never used */)
|
2012-01-18 16:02:24 +00:00
|
|
|
{
|
2023-01-29 09:12:12 +00:00
|
|
|
const auto *p = reinterpret_cast<const char16_t *>(str.constData());
|
2022-09-24 11:54:42 +00:00
|
|
|
const qsizetype len = str.size();
|
2012-01-18 16:02:24 +00:00
|
|
|
|
|
|
|
uint h = 0;
|
|
|
|
|
|
|
|
for (int i = 0; i < len; ++i)
|
|
|
|
h = 31 * h + p[i];
|
|
|
|
|
|
|
|
return h;
|
|
|
|
}
|
|
|
|
|
2011-04-27 10:05:43 +00:00
|
|
|
QT_END_NAMESPACE
|