Use compiler-builtins for the memcpy family of functions

2025-12-14 16:38:04 +01:00 · 2025-12-14 16:38:04 +01:00 · 62a572a0f0
parent c089667ade
commit 62a572a0f0
3 changed files with 1 additions and 177 deletions
--- a/2
+++ b/2
@ -32,7 +32,7 @@ $(BUILD)/kernel.all: $(LD_SCRIPT) $(LOCKFILE) $(MANIFEST) $(TARGET_SPEC) $(shell
 		--manifest-path "$(MANIFEST)" \
 		--target "$(TARGET_SPEC)" \
 		--release \
-		-Z build-std=core,alloc \
+		-Z build-std=core,alloc -Zbuild-std-features=compiler-builtins-mem \
 		-- \
 		-C link-arg=-T -Clink-arg="$(LD_SCRIPT)" \
 		-C link-arg=-z -Clink-arg=max-page-size=0x1000 \
--- a/src/externs.rs
+++ b/src/externs.rs
@ -1,172 +0,0 @@
-use core::mem;
-
-const WORD_SIZE: usize = mem::size_of::<usize>();
-
-/// Memcpy
-///
-/// Copy N bytes of memory from one location to another.
-///
-/// This faster implementation works by copying bytes not one-by-one, but in
-/// groups of 8 bytes (or 4 bytes in the case of 32-bit architectures).
-#[unsafe(no_mangle)]
-pub unsafe extern "C" fn memcpy(dest: *mut u8, src: *const u8, len: usize) -> *mut u8 {
-    unsafe {
-        // TODO: Alignment? Some sources claim that even on relatively modern µ-arches, unaligned
-        // accesses spanning two pages, can take dozens of cycles. That means chunk-based memcpy can
-        // even be slower for small lengths if alignment is not taken into account.
-        //
-        // TODO: Optimize out smaller loops by first checking if len < WORD_SIZE, and possibly if
-        // dest + WORD_SIZE spans two pages, then doing one unaligned copy, then aligning up, and then
-        // doing one last unaligned copy?
-        //
-        // TODO: While we use the -fno-builtin equivalent, can we guarantee LLVM won't insert memcpy
-        // call inside here? Maybe write it in assembly?
-
-        let mut i = 0_usize;
-
-        // First we copy len / WORD_SIZE chunks...
-
-        let chunks = len / WORD_SIZE;
-
-        while i < chunks * WORD_SIZE {
-            dest.add(i)
-                .cast::<usize>()
-                .write_unaligned(src.add(i).cast::<usize>().read_unaligned());
-            i += WORD_SIZE;
-        }
-
-        // .. then we copy len % WORD_SIZE bytes
-        while i < len {
-            dest.add(i).write(src.add(i).read());
-            i += 1;
-        }
-
-        dest
-    }
-}
-
-/// Memmove
-///
-/// Copy N bytes of memory from src to dest. The memory areas may overlap.
-///
-/// This faster implementation works by copying bytes not one-by-one, but in
-/// groups of 8 bytes (or 4 bytes in the case of 32-bit architectures).
-#[unsafe(no_mangle)]
-pub unsafe extern "C" fn memmove(dest: *mut u8, src: *const u8, len: usize) -> *mut u8 {
-    unsafe {
-        let chunks = len / WORD_SIZE;
-
-        // TODO: also require dest - src < len before choosing to copy backwards?
-        if src < dest as *const u8 {
-            // We have to copy backwards if copying upwards.
-
-            let mut i = len;
-
-            while i != chunks * WORD_SIZE {
-                i -= 1;
-                dest.add(i).write(src.add(i).read());
-            }
-
-            while i > 0 {
-                i -= WORD_SIZE;
-
-                dest.add(i)
-                    .cast::<usize>()
-                    .write_unaligned(src.add(i).cast::<usize>().read_unaligned());
-            }
-        } else {
-            // We have to copy forward if copying downwards.
-
-            let mut i = 0_usize;
-
-            while i < chunks * WORD_SIZE {
-                dest.add(i)
-                    .cast::<usize>()
-                    .write_unaligned(src.add(i).cast::<usize>().read_unaligned());
-
-                i += WORD_SIZE;
-            }
-
-            while i < len {
-                dest.add(i).write(src.add(i).read());
-                i += 1;
-            }
-        }
-
-        dest
-    }
-}
-
-/// Memset
-///
-/// Fill a block of memory with a specified value.
-///
-/// This faster implementation works by setting bytes not one-by-one, but in
-/// groups of 8 bytes (or 4 bytes in the case of 32-bit architectures).
-#[unsafe(no_mangle)]
-pub unsafe extern "C" fn memset(dest: *mut u8, byte: i32, len: usize) -> *mut u8 {
-    unsafe {
-        let byte = byte as u8;
-
-        let mut i = 0;
-
-        let broadcasted = usize::from_ne_bytes([byte; WORD_SIZE]);
-        let chunks = len / WORD_SIZE;
-
-        while i < chunks * WORD_SIZE {
-            dest.add(i).cast::<usize>().write_unaligned(broadcasted);
-            i += WORD_SIZE;
-        }
-
-        while i < len {
-            dest.add(i).write(byte);
-            i += 1;
-        }
-
-        dest
-    }
-}
-
-/// Memcmp
-///
-/// Compare two blocks of memory.
-///
-/// This faster implementation works by comparing bytes not one-by-one, but in
-/// groups of 8 bytes (or 4 bytes in the case of 32-bit architectures).
-#[unsafe(no_mangle)]
-pub unsafe extern "C" fn memcmp(s1: *const u8, s2: *const u8, len: usize) -> i32 {
-    unsafe {
-        let mut i = 0_usize;
-
-        // First compare WORD_SIZE chunks...
-        let chunks = len / WORD_SIZE;
-
-        while i < chunks * WORD_SIZE {
-            let a = s1.add(i).cast::<usize>().read_unaligned();
-            let b = s2.add(i).cast::<usize>().read_unaligned();
-
-            if a != b {
-                // x86 has had bswap since the 80486, and the compiler will likely use the faster
-                // movbe. AArch64 has the REV instruction, which I think is universally available.
-                let diff = usize::from_be(a).wrapping_sub(usize::from_be(b)) as isize;
-
-                // TODO: If chunk size == 32 bits, diff can be returned directly.
-                return diff.signum() as i32;
-            }
-            i += WORD_SIZE;
-        }
-
-        // ... and then compare bytes.
-        while i < len {
-            let a = s1.add(i).read();
-            let b = s2.add(i).read();
-
-            if a != b {
-                return i32::from(a) - i32::from(b);
-            }
-            i += 1;
-        }
-
-        0
-    }
-}
--- a/src/main.rs
+++ b/src/main.rs
@ -102,10 +102,6 @@ mod devices;
 /// Event handling
 mod event;

-/// External functions
-#[cfg(not(test))]
-mod externs;
-
 /// Logging
 mod log;