malloc: Improve performance of __libc_malloc

Improve performance of __libc_malloc by splitting it into 2 parts: first handle
the tcache fastpath, then do the rest in a separate tailcalled function.
This results in significant performance gains since __libc_malloc doesn't need
to setup a frame and we delay tcache initialization and setting of errno until
later.

On Neoverse V2, bench-malloc-simple improves by 6.7% overall (up to 8.5% for
ST case) and bench-malloc-thread improves by 20.3% for 1 thread and 14.4% for
32 threads.

Reviewed-by: DJ Delorie <dj@redhat.com>
This commit is contained in:
Wilco Dijkstra 2025-03-20 20:04:14 +00:00
parent 0a8e7ac95c
commit b0897944cc
1 changed files with 20 additions and 13 deletions

View File

@ -1325,6 +1325,9 @@ nextchunk-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
static __always_inline size_t
checked_request2size (size_t req) __nonnull (1)
{
_Static_assert (PTRDIFF_MAX <= SIZE_MAX / 2,
"PTRDIFF_MAX is not more than half of SIZE_MAX");
if (__glibc_unlikely (req > PTRDIFF_MAX))
return 0;
@ -3380,26 +3383,17 @@ tcache_thread_shutdown (void)
#endif /* !USE_TCACHE */
#if IS_IN (libc)
void *
__libc_malloc (size_t bytes)
static void * __attribute_noinline__
__libc_malloc2 (size_t bytes)
{
mstate ar_ptr;
void *victim;
_Static_assert (PTRDIFF_MAX <= SIZE_MAX / 2,
"PTRDIFF_MAX is not more than half of SIZE_MAX");
if (!__malloc_initialized)
ptmalloc_init ();
#if USE_TCACHE
bool err = tcache_try_malloc (bytes, &victim);
if (err)
return NULL;
if (victim)
return tag_new_usable (victim);
#endif
MAYBE_INIT_TCACHE ();
if (SINGLE_THREAD_P)
{
@ -3430,6 +3424,19 @@ __libc_malloc (size_t bytes)
ar_ptr == arena_for_chunk (mem2chunk (victim)));
return victim;
}
void *
__libc_malloc (size_t bytes)
{
#if USE_TCACHE
size_t tc_idx = csize2tidx (checked_request2size (bytes));
if (tcache_available (tc_idx))
return tag_new_usable (tcache_get (tc_idx));
#endif
return __libc_malloc2 (bytes);
}
libc_hidden_def (__libc_malloc)
void