mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2024-09-29 08:11:13 +00:00
AK+LibC: Always use REP MOVSB/STOSB for memcpy()/memset()
There's no great advantage to using MMX instructions here on modern processors, since REP MOVSB/STOSB are optimized in microcode anyway and tend to run much faster than MMX/SSE/AVX variants. This also makes it much easier to implement high-level emulation of memcpy/memset in UserspaceEmulator once we get there. :^)
This commit is contained in:
parent
272dbb82ff
commit
1366557094
Notes:
sideshowbarker
2024-07-19 04:34:11 +09:00
Author: https://github.com/awesomekling Commit: https://github.com/SerenityOS/serenity/commit/13665570947
14
AK/Memory.h
14
AK/Memory.h
|
@ -35,23 +35,11 @@
|
|||
# include <string.h>
|
||||
#endif
|
||||
|
||||
#if defined(__serenity__) && !defined(KERNEL)
|
||||
extern "C" void* mmx_memcpy(void* to, const void* from, size_t);
|
||||
#endif
|
||||
|
||||
ALWAYS_INLINE void fast_u32_copy(u32* dest, const u32* src, size_t count)
|
||||
{
|
||||
#if defined(__serenity__) && !defined(KERNEL)
|
||||
if (count >= 256) {
|
||||
mmx_memcpy(dest, src, count * sizeof(count));
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
asm volatile(
|
||||
"rep movsl\n"
|
||||
: "=S"(src), "=D"(dest), "=c"(count)
|
||||
: "S"(src), "D"(dest), "c"(count)
|
||||
: "memory");
|
||||
: "+S"(src), "+D"(dest), "+c"(count)::"memory");
|
||||
}
|
||||
|
||||
ALWAYS_INLINE void fast_u32_fill(u32* dest, u32 value, size_t count)
|
||||
|
|
|
@ -139,103 +139,24 @@ int memcmp(const void* v1, const void* v2, size_t n)
|
|||
}
|
||||
|
||||
#if ARCH(I386)
|
||||
void* mmx_memcpy(void* dest, const void* src, size_t len)
|
||||
{
|
||||
ASSERT(len >= 1024);
|
||||
|
||||
auto* dest_ptr = (u8*)dest;
|
||||
auto* src_ptr = (const u8*)src;
|
||||
|
||||
if ((u32)dest_ptr & 7) {
|
||||
u32 prologue = 8 - ((u32)dest_ptr & 7);
|
||||
len -= prologue;
|
||||
asm volatile(
|
||||
"rep movsb\n"
|
||||
: "=S"(src_ptr), "=D"(dest_ptr), "=c"(prologue)
|
||||
: "0"(src_ptr), "1"(dest_ptr), "2"(prologue)
|
||||
: "memory");
|
||||
}
|
||||
for (u32 i = len / 64; i; --i) {
|
||||
asm volatile(
|
||||
"movq (%0), %%mm0\n"
|
||||
"movq 8(%0), %%mm1\n"
|
||||
"movq 16(%0), %%mm2\n"
|
||||
"movq 24(%0), %%mm3\n"
|
||||
"movq 32(%0), %%mm4\n"
|
||||
"movq 40(%0), %%mm5\n"
|
||||
"movq 48(%0), %%mm6\n"
|
||||
"movq 56(%0), %%mm7\n"
|
||||
"movq %%mm0, (%1)\n"
|
||||
"movq %%mm1, 8(%1)\n"
|
||||
"movq %%mm2, 16(%1)\n"
|
||||
"movq %%mm3, 24(%1)\n"
|
||||
"movq %%mm4, 32(%1)\n"
|
||||
"movq %%mm5, 40(%1)\n"
|
||||
"movq %%mm6, 48(%1)\n"
|
||||
"movq %%mm7, 56(%1)\n" ::"r"(src_ptr),
|
||||
"r"(dest_ptr)
|
||||
: "memory");
|
||||
src_ptr += 64;
|
||||
dest_ptr += 64;
|
||||
}
|
||||
asm volatile("emms" ::
|
||||
: "memory");
|
||||
// Whatever remains we'll have to memcpy.
|
||||
len %= 64;
|
||||
if (len)
|
||||
memcpy(dest_ptr, src_ptr, len);
|
||||
return dest;
|
||||
}
|
||||
|
||||
void* memcpy(void* dest_ptr, const void* src_ptr, size_t n)
|
||||
{
|
||||
if (n >= 1024)
|
||||
return mmx_memcpy(dest_ptr, src_ptr, n);
|
||||
|
||||
u32 dest = (u32)dest_ptr;
|
||||
u32 src = (u32)src_ptr;
|
||||
// FIXME: Support starting at an unaligned address.
|
||||
if (!(dest & 0x3) && !(src & 0x3) && n >= 12) {
|
||||
size_t u32s = n / sizeof(u32);
|
||||
asm volatile(
|
||||
"rep movsl\n"
|
||||
: "=S"(src), "=D"(dest)
|
||||
: "S"(src), "D"(dest), "c"(u32s)
|
||||
: "memory");
|
||||
n -= u32s * sizeof(u32);
|
||||
if (n == 0)
|
||||
return dest_ptr;
|
||||
}
|
||||
void* original_dest = dest_ptr;
|
||||
asm volatile(
|
||||
"rep movsb\n" ::"S"(src), "D"(dest), "c"(n)
|
||||
: "memory");
|
||||
return dest_ptr;
|
||||
"rep movsb"
|
||||
: "+D"(dest_ptr), "+S"(src_ptr), "+c"(n)::"memory");
|
||||
return original_dest;
|
||||
}
|
||||
|
||||
void* memset(void* dest_ptr, int c, size_t n)
|
||||
{
|
||||
u32 dest = (u32)dest_ptr;
|
||||
// FIXME: Support starting at an unaligned address.
|
||||
if (!(dest & 0x3) && n >= 12) {
|
||||
size_t u32s = n / sizeof(u32);
|
||||
u32 expanded_c = (u8)c;
|
||||
expanded_c |= expanded_c << 8;
|
||||
expanded_c |= expanded_c << 16;
|
||||
asm volatile(
|
||||
"rep stosl\n"
|
||||
: "=D"(dest)
|
||||
: "D"(dest), "c"(u32s), "a"(expanded_c)
|
||||
: "memory");
|
||||
n -= u32s * sizeof(u32);
|
||||
if (n == 0)
|
||||
return dest_ptr;
|
||||
}
|
||||
void* original_dest = dest_ptr;
|
||||
asm volatile(
|
||||
"rep stosb\n"
|
||||
: "=D"(dest), "=c"(n)
|
||||
: "0"(dest), "1"(n), "a"(c)
|
||||
: "=D"(dest_ptr), "=c"(n)
|
||||
: "0"(dest_ptr), "1"(n), "a"(c)
|
||||
: "memory");
|
||||
return dest_ptr;
|
||||
return original_dest;
|
||||
}
|
||||
#else
|
||||
void* memcpy(void* dest_ptr, const void* src_ptr, size_t n)
|
||||
|
|
Loading…
Reference in a new issue