lib/crypto: chacha: Consolidate into single module
Consolidate the ChaCha code into a single module (excluding
chacha-block-generic.c which remains always built-in for random.c),
similar to various other algorithms:
- Each arch now provides a header file lib/crypto/$(SRCARCH)/chacha.h,
replacing lib/crypto/$(SRCARCH)/chacha*.c. The header defines
chacha_crypt_arch() and hchacha_block_arch(). It is included by
lib/crypto/chacha.c, and thus the code gets built into the single
libchacha module, with improved inlining in some cases.
- Whether arch-optimized ChaCha is buildable is now controlled centrally
by lib/crypto/Kconfig instead of by lib/crypto/$(SRCARCH)/Kconfig.
The conditions for enabling it remain the same as before, and it
remains enabled by default.
- Any additional arch-specific translation units for the optimized
ChaCha code, such as assembly files, are now compiled by
lib/crypto/Makefile instead of lib/crypto/$(SRCARCH)/Makefile.
This removes the last use for the Makefile and Kconfig files in the
arm64, mips, powerpc, riscv, and s390 subdirectories of lib/crypto/. So
also remove those files and the references to them.
Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20250827151131.27733-7-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
2025-08-27 15:11:25 +00:00
|
|
|
/* SPDX-License-Identifier: GPL-2.0 */
|
2019-11-08 12:22:14 +00:00
|
|
|
/*
|
2025-04-05 18:26:03 +00:00
|
|
|
* ChaCha and HChaCha functions (ARM optimized)
|
2019-11-08 12:22:14 +00:00
|
|
|
*
|
|
|
|
|
* Copyright (C) 2016-2019 Linaro, Ltd. <ard.biesheuvel@linaro.org>
|
|
|
|
|
* Copyright (C) 2015 Martin Willi
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
#include <crypto/internal/simd.h>
|
2019-11-08 12:22:15 +00:00
|
|
|
#include <linux/jump_label.h>
|
2019-11-08 12:22:14 +00:00
|
|
|
#include <linux/kernel.h>
|
|
|
|
|
|
|
|
|
|
#include <asm/cputype.h>
|
|
|
|
|
#include <asm/hwcap.h>
|
|
|
|
|
#include <asm/neon.h>
|
|
|
|
|
#include <asm/simd.h>
|
|
|
|
|
|
2025-05-05 18:18:21 +00:00
|
|
|
asmlinkage void chacha_block_xor_neon(const struct chacha_state *state,
|
|
|
|
|
u8 *dst, const u8 *src, int nrounds);
|
|
|
|
|
asmlinkage void chacha_4block_xor_neon(const struct chacha_state *state,
|
|
|
|
|
u8 *dst, const u8 *src,
|
crypto: arm/chacha-neon - optimize for non-block size multiples
The current NEON based ChaCha implementation for ARM is optimized for
multiples of 4x the ChaCha block size (64 bytes). This makes sense for
block encryption, but given that ChaCha is also often used in the
context of networking, it makes sense to consider arbitrary length
inputs as well.
For example, WireGuard typically uses 1420 byte packets, and performing
ChaCha encryption involves 5 invocations of chacha_4block_xor_neon()
and 3 invocations of chacha_block_xor_neon(), where the last one also
involves a memcpy() using a buffer on the stack to process the final
chunk of 1420 % 64 == 12 bytes.
Let's optimize for this case as well, by letting chacha_4block_xor_neon()
deal with any input size between 64 and 256 bytes, using NEON permutation
instructions and overlapping loads and stores. This way, the 140 byte
tail of a 1420 byte input buffer can simply be processed in one go.
This results in the following performance improvements for 1420 byte
blocks, without significant impact on power-of-2 input sizes. (Note
that Raspberry Pi is widely used in combination with a 32-bit kernel,
even though the core is 64-bit capable)
Cortex-A8 (BeagleBone) : 7%
Cortex-A15 (Calxeda Midway) : 21%
Cortex-A53 (Raspberry Pi 3) : 3%
Cortex-A72 (Raspberry Pi 4) : 19%
Cc: Eric Biggers <ebiggers@google.com>
Cc: "Jason A . Donenfeld" <Jason@zx2c4.com>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2020-11-03 16:28:09 +00:00
|
|
|
int nrounds, unsigned int nbytes);
|
2025-05-05 18:18:21 +00:00
|
|
|
asmlinkage void hchacha_block_arm(const struct chacha_state *state,
|
2025-05-05 18:18:24 +00:00
|
|
|
u32 out[HCHACHA_OUT_WORDS], int nrounds);
|
2025-05-05 18:18:21 +00:00
|
|
|
asmlinkage void hchacha_block_neon(const struct chacha_state *state,
|
2025-05-05 18:18:24 +00:00
|
|
|
u32 out[HCHACHA_OUT_WORDS], int nrounds);
|
2019-11-08 12:22:14 +00:00
|
|
|
|
|
|
|
|
asmlinkage void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes,
|
2025-05-05 18:18:21 +00:00
|
|
|
const struct chacha_state *state, int nrounds);
|
2019-11-08 12:22:14 +00:00
|
|
|
|
2019-11-08 12:22:15 +00:00
|
|
|
/*
 * Static key gating the NEON code paths.  It starts out false and is switched
 * on by chacha_mod_init_arch() when NEON is available and the CPU is not one
 * of the parts excluded there; neon_usable() tests it.
 */
static __ro_after_init DEFINE_STATIC_KEY_FALSE(use_neon);
|
|
|
|
|
|
2019-11-08 12:22:14 +00:00
|
|
|
static inline bool neon_usable(void)
|
|
|
|
|
{
|
2019-11-08 12:22:15 +00:00
|
|
|
return static_branch_likely(&use_neon) && crypto_simd_usable();
|
2019-11-08 12:22:14 +00:00
|
|
|
}
|
|
|
|
|
|
2025-05-05 18:18:21 +00:00
|
|
|
/*
 * XOR @bytes bytes from @src into @dst using the NEON routines, advancing
 * state->x[12] by the number of blocks consumed.
 *
 * The caller (chacha_crypt_arch()) wraps this in kernel_neon_begin() /
 * kernel_neon_end().
 */
static void chacha_doneon(struct chacha_state *state, u8 *dst, const u8 *src,
			  unsigned int bytes, int nrounds)
{
	u8 bounce[CHACHA_BLOCK_SIZE];

	/*
	 * Anything larger than one block goes through the 4-block routine in
	 * chunks of up to four blocks; that routine takes the byte count and
	 * so also covers chunks that are not a multiple of the block size.
	 */
	while (bytes > CHACHA_BLOCK_SIZE) {
		unsigned int chunk = min(bytes, CHACHA_BLOCK_SIZE * 4U);

		chacha_4block_xor_neon(state, dst, src, nrounds, chunk);
		state->x[12] += DIV_ROUND_UP(chunk, CHACHA_BLOCK_SIZE);
		dst += chunk;
		src += chunk;
		bytes -= chunk;
	}

	if (!bytes)
		return;

	if (bytes == CHACHA_BLOCK_SIZE) {
		/* Exactly one block left: operate on the buffers directly. */
		chacha_block_xor_neon(state, dst, src, nrounds);
	} else {
		/*
		 * Partial block: the single-block routine takes no length, so
		 * stage the data in a block-sized stack buffer and copy only
		 * the requested bytes back out.
		 */
		memcpy(bounce, src, bytes);
		chacha_block_xor_neon(state, bounce, bounce, nrounds);
		memcpy(dst, bounce, bytes);
	}
	state->x[12]++;
}
|
|
|
|
|
|
lib/crypto: chacha: Consolidate into single module
Consolidate the ChaCha code into a single module (excluding
chacha-block-generic.c which remains always built-in for random.c),
similar to various other algorithms:
- Each arch now provides a header file lib/crypto/$(SRCARCH)/chacha.h,
replacing lib/crypto/$(SRCARCH)/chacha*.c. The header defines
chacha_crypt_arch() and hchacha_block_arch(). It is included by
lib/crypto/chacha.c, and thus the code gets built into the single
libchacha module, with improved inlining in some cases.
- Whether arch-optimized ChaCha is buildable is now controlled centrally
by lib/crypto/Kconfig instead of by lib/crypto/$(SRCARCH)/Kconfig.
The conditions for enabling it remain the same as before, and it
remains enabled by default.
- Any additional arch-specific translation units for the optimized
ChaCha code, such as assembly files, are now compiled by
lib/crypto/Makefile instead of lib/crypto/$(SRCARCH)/Makefile.
This removes the last use for the Makefile and Kconfig files in the
arm64, mips, powerpc, riscv, and s390 subdirectories of lib/crypto/. So
also remove those files and the references to them.
Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20250827151131.27733-7-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
2025-08-27 15:11:25 +00:00
|
|
|
static void hchacha_block_arch(const struct chacha_state *state,
|
|
|
|
|
u32 out[HCHACHA_OUT_WORDS], int nrounds)
|
2019-11-08 12:22:15 +00:00
|
|
|
{
|
|
|
|
|
if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable()) {
|
2025-05-05 18:18:24 +00:00
|
|
|
hchacha_block_arm(state, out, nrounds);
|
2019-11-08 12:22:15 +00:00
|
|
|
} else {
|
|
|
|
|
kernel_neon_begin();
|
2025-05-05 18:18:24 +00:00
|
|
|
hchacha_block_neon(state, out, nrounds);
|
2019-11-08 12:22:15 +00:00
|
|
|
kernel_neon_end();
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
lib/crypto: chacha: Consolidate into single module
Consolidate the ChaCha code into a single module (excluding
chacha-block-generic.c which remains always built-in for random.c),
similar to various other algorithms:
- Each arch now provides a header file lib/crypto/$(SRCARCH)/chacha.h,
replacing lib/crypto/$(SRCARCH)/chacha*.c. The header defines
chacha_crypt_arch() and hchacha_block_arch(). It is included by
lib/crypto/chacha.c, and thus the code gets built into the single
libchacha module, with improved inlining in some cases.
- Whether arch-optimized ChaCha is buildable is now controlled centrally
by lib/crypto/Kconfig instead of by lib/crypto/$(SRCARCH)/Kconfig.
The conditions for enabling it remain the same as before, and it
remains enabled by default.
- Any additional arch-specific translation units for the optimized
ChaCha code, such as assembly files, are now compiled by
lib/crypto/Makefile instead of lib/crypto/$(SRCARCH)/Makefile.
This removes the last use for the Makefile and Kconfig files in the
arm64, mips, powerpc, riscv, and s390 subdirectories of lib/crypto/. So
also remove those files and the references to them.
Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20250827151131.27733-7-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
2025-08-27 15:11:25 +00:00
|
|
|
/*
 * Arch entry point for ChaCha encryption/decryption of @bytes bytes from
 * @src to @dst; state->x[12] is advanced by the number of blocks consumed.
 *
 * The NEON path is taken only when kernel-mode NEON is configured,
 * neon_usable() allows it, and the request is larger than one block.
 * Everything else is handled by the scalar routine.
 */
static void chacha_crypt_arch(struct chacha_state *state, u8 *dst,
			      const u8 *src, unsigned int bytes, int nrounds)
{
	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && neon_usable() &&
	    bytes > CHACHA_BLOCK_SIZE) {
		/*
		 * Process at most 4 KiB per kernel_neon_begin()/end() section
		 * so that a long request is split into bounded SIMD sections
		 * (the chunking was added to limit how long preemption stays
		 * disabled).  bytes is non-zero on entry, so the loop body
		 * runs at least once.
		 */
		while (bytes) {
			unsigned int todo = min_t(unsigned int, bytes, SZ_4K);

			kernel_neon_begin();
			chacha_doneon(state, dst, src, todo, nrounds);
			kernel_neon_end();

			dst += todo;
			src += todo;
			bytes -= todo;
		}
		return;
	}

	/*
	 * Scalar path.  chacha_doarm() takes the state as const, so the
	 * counter word is bumped here after the call.
	 */
	chacha_doarm(dst, src, bytes, state, nrounds);
	state->x[12] += DIV_ROUND_UP(bytes, CHACHA_BLOCK_SIZE);
}
|
|
|
|
|
|
lib/crypto: chacha: Consolidate into single module
Consolidate the ChaCha code into a single module (excluding
chacha-block-generic.c which remains always built-in for random.c),
similar to various other algorithms:
- Each arch now provides a header file lib/crypto/$(SRCARCH)/chacha.h,
replacing lib/crypto/$(SRCARCH)/chacha*.c. The header defines
chacha_crypt_arch() and hchacha_block_arch(). It is included by
lib/crypto/chacha.c, and thus the code gets built into the single
libchacha module, with improved inlining in some cases.
- Whether arch-optimized ChaCha is buildable is now controlled centrally
by lib/crypto/Kconfig instead of by lib/crypto/$(SRCARCH)/Kconfig.
The conditions for enabling it remain the same as before, and it
remains enabled by default.
- Any additional arch-specific translation units for the optimized
ChaCha code, such as assembly files, are now compiled by
lib/crypto/Makefile instead of lib/crypto/$(SRCARCH)/Makefile.
This removes the last use for the Makefile and Kconfig files in the
arm64, mips, powerpc, riscv, and s390 subdirectories of lib/crypto/. So
also remove those files and the references to them.
Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20250827151131.27733-7-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
2025-08-27 15:11:25 +00:00
|
|
|
#define chacha_mod_init_arch chacha_mod_init_arch
|
|
|
|
|
static void chacha_mod_init_arch(void)
|
2019-11-08 12:22:14 +00:00
|
|
|
{
|
|
|
|
|
if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_NEON)) {
|
|
|
|
|
switch (read_cpuid_part()) {
|
|
|
|
|
case ARM_CPU_PART_CORTEX_A7:
|
|
|
|
|
case ARM_CPU_PART_CORTEX_A5:
|
|
|
|
|
/*
|
|
|
|
|
* The Cortex-A7 and Cortex-A5 do not perform well with
|
|
|
|
|
* the NEON implementation but do incredibly with the
|
|
|
|
|
* scalar one and use less power.
|
|
|
|
|
*/
|
|
|
|
|
break;
|
2019-11-08 12:22:15 +00:00
|
|
|
default:
|
|
|
|
|
static_branch_enable(&use_neon);
|
2019-11-08 12:22:14 +00:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|