Kernel: Clean up sys$futex and add support for cross-process futexes

This commit is contained in:
Idan Horowitz 2022-07-14 01:25:35 +03:00 committed by Andreas Kling
parent 55c7496200
commit 9db10887a1
Notes: sideshowbarker 2024-07-17 08:42:33 +09:00
9 changed files with 164 additions and 70 deletions

View file

@ -53,7 +53,8 @@ extern "C" {
#define FUTEX_WAKE_BITSET 10
#define FUTEX_CLOCK_REALTIME (1 << 8)
#define FUTEX_CMD_MASK ~(FUTEX_CLOCK_REALTIME)
#define FUTEX_PRIVATE_FLAG (1 << 9)
#define FUTEX_CMD_MASK ~(FUTEX_CLOCK_REALTIME | FUTEX_PRIVATE_FLAG)
#define FUTEX_BITSET_MATCH_ANY 0xffffffff

View file

@ -82,7 +82,22 @@ enum class VeilState {
Locked,
};
using FutexQueues = HashMap<FlatPtr, NonnullRefPtr<FutexQueue>>;
static constexpr FlatPtr futex_key_private_flag = 0b1;
union GlobalFutexKey {
struct {
Memory::VMObject const* vmobject;
FlatPtr offset;
} shared;
struct {
Memory::AddressSpace const* address_space;
FlatPtr user_address;
} private_;
struct {
FlatPtr parent;
FlatPtr offset;
} raw;
};
static_assert(sizeof(GlobalFutexKey) == (sizeof(FlatPtr) * 2));
struct LoadResult;
@ -575,6 +590,8 @@ private:
void clear_signal_handlers_for_exec();
void clear_futex_queues_on_exec();
ErrorOr<GlobalFutexKey> get_futex_key(FlatPtr user_address, bool shared);
ErrorOr<void> remap_range_as_stack(FlatPtr address, size_t size);
ErrorOr<FlatPtr> read_impl(int fd, Userspace<u8*> buffer, size_t size);
@ -829,9 +846,6 @@ private:
OwnPtr<PerformanceEventBuffer> m_perf_event_buffer;
FutexQueues m_futex_queues;
Spinlock m_futex_lock;
// This member is used in the implementation of ptrace's PT_TRACEME flag.
// If it is set to true, the process will stop at the next execve syscall
// and wait for a tracer to attach.
@ -1040,3 +1054,11 @@ struct AK::Formatter<Kernel::Process> : AK::Formatter<FormatString> {
return AK::Formatter<FormatString>::format(builder, "{}({})"sv, value.name(), value.pid().value());
}
};
namespace AK {
template<>
struct Traits<Kernel::GlobalFutexKey> : public GenericTraits<Kernel::GlobalFutexKey> {
static unsigned hash(Kernel::GlobalFutexKey const& futex_key) { return pair_int_hash(ptr_hash(futex_key.raw.parent), ptr_hash(futex_key.raw.offset)); }
static bool equals(Kernel::GlobalFutexKey const& a, Kernel::GlobalFutexKey const& b) { return a.raw.parent == b.raw.parent && a.raw.offset == b.raw.offset; }
};
};

View file

@ -1,25 +1,89 @@
/*
* Copyright (c) 2018-2021, Andreas Kling <kling@serenityos.org>
* Copyright (c) 2022, Idan Horowitz <idan.horowitz@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include <AK/Singleton.h>
#include <Kernel/Debug.h>
#include <Kernel/Memory/InodeVMObject.h>
#include <Kernel/Memory/MemoryManager.h>
#include <Kernel/Process.h>
namespace Kernel {
static Singleton<SpinlockProtected<HashMap<GlobalFutexKey, NonnullRefPtr<FutexQueue>>>> s_global_futex_queues;
void Process::clear_futex_queues_on_exec()
{
SpinlockLocker lock(m_futex_lock);
for (auto& it : m_futex_queues) {
bool did_wake_all;
it.value->wake_all(did_wake_all);
VERIFY(did_wake_all); // No one should be left behind...
s_global_futex_queues->with([this](auto& queues) {
auto const* address_space = &this->address_space();
queues.remove_all_matching([address_space](auto& futex_key, auto& futex_queue) {
if ((futex_key.raw.offset & futex_key_private_flag) == 0)
return false;
if (futex_key.private_.address_space != address_space)
return false;
bool did_wake_all;
futex_queue->wake_all(did_wake_all);
VERIFY(did_wake_all); // No one should be left behind...
return true;
});
});
}
ErrorOr<GlobalFutexKey> Process::get_futex_key(FlatPtr user_address, bool shared)
{
if (user_address & 0b11) // user_address points to a u32, so must be 4byte aligned
return EINVAL;
auto range = Memory::VirtualRange { VirtualAddress(user_address), sizeof(u32) };
if (!Kernel::Memory::is_user_range(range))
return EFAULT;
if (!shared) { // If this is thread-shared, we can skip searching the matching region
return GlobalFutexKey {
.private_ = {
.address_space = &address_space(),
.user_address = user_address | futex_key_private_flag,
}
};
}
m_futex_queues.clear();
auto* matching_region = address_space().find_region_containing(range);
if (!matching_region)
return EFAULT;
// The user wants to share this futex, but if the address doesn't point to a shared resource, there's not
// much sharing to be done, so let's mark this as private
if (!matching_region->is_shared()) {
return GlobalFutexKey {
.private_ = {
.address_space = &address_space(),
.user_address = user_address | futex_key_private_flag,
}
};
}
// This address is backed by a shared VMObject, if it's an AnonymousVMObject, it can be shared between processes
// via forking, and shared regions that are cloned during a fork retain their original AnonymousVMObject.
// On the other hand, if it's a SharedInodeVMObject, it can be shared by two processes mapping the same file as
// MAP_SHARED, but since they are deduplicated based on the inode, in all cases the VMObject pointer should be
// a unique global identifier.
// NOTE: This assumes that a program will not unmap the only region keeping the vmobject alive while waiting on it,
// if it does, it will get stuck waiting forever until interrupted by a signal, but since that use case is defined as
// a programmer error, we are fine with it.
auto const& vmobject = matching_region->vmobject();
if (vmobject.is_inode())
VERIFY(vmobject.is_shared_inode());
return GlobalFutexKey {
.shared = {
.vmobject = &vmobject,
.offset = matching_region->offset_in_vmobject_from_vaddr(range.base()) }
};
}
ErrorOr<FlatPtr> Process::sys$futex(Userspace<Syscall::SC_futex_params const*> user_params)
@ -35,6 +99,8 @@ ErrorOr<FlatPtr> Process::sys$futex(Userspace<Syscall::SC_futex_params const*> u
return ENOSYS;
}
bool shared = (params.futex_op & FUTEX_PRIVATE_FLAG) == 0;
switch (cmd) {
case FUTEX_WAIT:
case FUTEX_WAIT_BITSET:
@ -56,41 +122,44 @@ ErrorOr<FlatPtr> Process::sys$futex(Userspace<Syscall::SC_futex_params const*> u
}
}
auto find_futex_queue = [&](FlatPtr user_address, bool create_if_not_found, bool* did_create = nullptr) -> ErrorOr<RefPtr<FutexQueue>> {
auto find_futex_queue = [&](GlobalFutexKey futex_key, bool create_if_not_found, bool* did_create = nullptr) -> ErrorOr<RefPtr<FutexQueue>> {
VERIFY(!create_if_not_found || did_create != nullptr);
auto it = m_futex_queues.find(user_address);
if (it != m_futex_queues.end())
return it->value;
if (create_if_not_found) {
return s_global_futex_queues->with([&](auto& queues) -> ErrorOr<RefPtr<FutexQueue>> {
auto it = queues.find(futex_key);
if (it != queues.end())
return it->value;
if (!create_if_not_found)
return nullptr;
*did_create = true;
auto futex_queue = TRY(adopt_nonnull_ref_or_enomem(new (nothrow) FutexQueue));
auto result = TRY(m_futex_queues.try_set(user_address, futex_queue));
auto result = TRY(queues.try_set(futex_key, futex_queue));
VERIFY(result == AK::HashSetResult::InsertedNewEntry);
return futex_queue;
}
return nullptr;
});
};
auto remove_futex_queue = [&](FlatPtr user_address) {
if (auto it = m_futex_queues.find(user_address); it != m_futex_queues.end()) {
if (it->value->try_remove()) {
m_futex_queues.remove(it);
}
}
auto remove_futex_queue = [&](GlobalFutexKey futex_key) {
return s_global_futex_queues->with([&](auto& queues) {
auto it = queues.find(futex_key);
if (it == queues.end())
return;
if (it->value->try_remove())
queues.remove(it);
});
};
auto do_wake = [&](FlatPtr user_address, u32 count, Optional<u32> bitmask) -> ErrorOr<int> {
auto do_wake = [&](FlatPtr user_address, u32 count, Optional<u32> const& bitmask) -> ErrorOr<int> {
if (count == 0)
return 0;
SpinlockLocker locker(m_futex_lock);
auto futex_queue = TRY(find_futex_queue(user_address, false));
auto futex_key = TRY(get_futex_key(user_address, shared));
auto futex_queue = TRY(find_futex_queue(futex_key, false));
if (!futex_queue)
return 0;
bool is_empty;
u32 woke_count = futex_queue->wake_n(count, bitmask, is_empty);
if (is_empty) {
// If there are no more waiters, we want to get rid of the futex!
remove_futex_queue(user_address);
remove_futex_queue(futex_key);
}
return (int)woke_count;
};
@ -101,6 +170,7 @@ ErrorOr<FlatPtr> Process::sys$futex(Userspace<Syscall::SC_futex_params const*> u
auto do_wait = [&](u32 bitset) -> ErrorOr<FlatPtr> {
bool did_create;
RefPtr<FutexQueue> futex_queue;
auto futex_key = TRY(get_futex_key(user_address, shared));
do {
auto user_value = user_atomic_load_relaxed(params.userspace_address);
if (!user_value.has_value())
@ -111,9 +181,8 @@ ErrorOr<FlatPtr> Process::sys$futex(Userspace<Syscall::SC_futex_params const*> u
}
atomic_thread_fence(AK::MemoryOrder::memory_order_acquire);
SpinlockLocker locker(m_futex_lock);
did_create = false;
futex_queue = TRY(find_futex_queue(user_address, true, &did_create));
futex_queue = TRY(find_futex_queue(futex_key, true, &did_create));
VERIFY(futex_queue);
// We need to try again if we didn't create this queue and the existing queue
// was removed before we were able to queue an imminent wait.
@ -124,10 +193,9 @@ ErrorOr<FlatPtr> Process::sys$futex(Userspace<Syscall::SC_futex_params const*> u
Thread::BlockResult block_result = futex_queue->wait_on(timeout, bitset);
SpinlockLocker locker(m_futex_lock);
if (futex_queue->is_empty_and_no_imminent_waits()) {
// If there are no more waiters, we want to get rid of the futex!
remove_futex_queue(user_address);
remove_futex_queue(futex_key);
}
if (block_result == Thread::BlockResult::InterruptedByTimeout) {
return ETIMEDOUT;
@ -143,25 +211,28 @@ ErrorOr<FlatPtr> Process::sys$futex(Userspace<Syscall::SC_futex_params const*> u
return EAGAIN;
atomic_thread_fence(AK::MemoryOrder::memory_order_acquire);
int woken_or_requeued = 0;
SpinlockLocker locker(m_futex_lock);
if (auto futex_queue = TRY(find_futex_queue(user_address, false))) {
RefPtr<FutexQueue> target_futex_queue;
bool is_empty, is_target_empty;
woken_or_requeued = TRY(futex_queue->wake_n_requeue(
params.val, [&]() -> ErrorOr<FutexQueue*> {
// NOTE: futex_queue's lock is being held while this callback is called
// The reason we're doing this in a callback is that we don't want to always
// create a target queue, only if we actually have anything to move to it!
target_futex_queue = TRY(find_futex_queue(user_address2, true));
return target_futex_queue.ptr();
},
params.val2, is_empty, is_target_empty));
if (is_empty)
remove_futex_queue(user_address);
if (is_target_empty && target_futex_queue)
remove_futex_queue(user_address2);
}
auto futex_key = TRY(get_futex_key(user_address, shared));
auto futex_queue = TRY(find_futex_queue(futex_key, false));
if (!futex_queue)
return 0;
RefPtr<FutexQueue> target_futex_queue;
bool is_empty = false;
bool is_target_empty = false;
auto futex_key2 = TRY(get_futex_key(user_address2, shared));
auto woken_or_requeued = TRY(futex_queue->wake_n_requeue(
params.val, [&]() -> ErrorOr<FutexQueue*> {
// NOTE: futex_queue's lock is being held while this callback is called
// The reason we're doing this in a callback is that we don't want to always
// create a target queue, only if we actually have anything to move to it!
target_futex_queue = TRY(find_futex_queue(futex_key2, true));
return target_futex_queue.ptr();
},
params.val2, is_empty, is_target_empty));
if (is_empty)
remove_futex_queue(futex_key);
if (is_target_empty && target_futex_queue)
remove_futex_queue(futex_key2);
return woken_or_requeued;
};

View file

@ -655,7 +655,7 @@ static int rwlock_rdlock_maybe_timed(u32* lockp, const struct timespec* timeout
// Seems like someone is writing (or is interested in writing and we let them have the lock)
// wait until they're done.
auto rc = futex(lockp, FUTEX_WAIT_BITSET, current, timeout, nullptr, reader_wake_mask);
auto rc = futex(lockp, FUTEX_WAIT_BITSET | FUTEX_PRIVATE_FLAG, current, timeout, nullptr, reader_wake_mask);
if (rc < 0 && errno == ETIMEDOUT && timeout) {
return value_if_timeout;
}
@ -703,7 +703,7 @@ static int rwlock_wrlock_maybe_timed(pthread_rwlock_t* lockval_p, const struct t
// Seems like someone is writing (or is interested in writing and we let them have the lock)
// wait until they're done.
auto rc = futex(lockp, FUTEX_WAIT_BITSET, current, timeout, nullptr, writer_wake_mask);
auto rc = futex(lockp, FUTEX_WAIT_BITSET | FUTEX_PRIVATE_FLAG, current, timeout, nullptr, writer_wake_mask);
if (rc < 0 && errno == ETIMEDOUT && timeout) {
return value_if_timeout;
}
@ -794,7 +794,7 @@ int pthread_rwlock_unlock(pthread_rwlock_t* lockval_p)
auto desired = current & ~(writer_locked_mask | writer_intent_mask);
AK::atomic_store(lockp, desired, AK::MemoryOrder::memory_order_release);
// Then wake both readers and writers, if any.
auto rc = futex(lockp, FUTEX_WAKE_BITSET, current, nullptr, nullptr, (current & writer_wake_mask) | reader_wake_mask);
auto rc = futex(lockp, FUTEX_WAKE_BITSET | FUTEX_PRIVATE_FLAG, current, nullptr, nullptr, (current & writer_wake_mask) | reader_wake_mask);
if (rc < 0)
return errno;
return 0;

View file

@ -98,7 +98,7 @@ int pthread_cond_timedwait(pthread_cond_t* cond, pthread_mutex_t* mutex, const s
// value might change as soon as we unlock it.
u32 value = AK::atomic_fetch_or(&cond->value, NEED_TO_WAKE_ONE | NEED_TO_WAKE_ALL, AK::memory_order_release) | NEED_TO_WAKE_ONE | NEED_TO_WAKE_ALL;
pthread_mutex_unlock(mutex);
int rc = futex_wait(&cond->value, value, abstime, cond->clockid);
int rc = futex_wait(&cond->value, value, abstime, cond->clockid, false);
if (rc < 0 && errno != EAGAIN)
return errno;
@ -129,7 +129,7 @@ int pthread_cond_signal(pthread_cond_t* cond)
if (!(value & NEED_TO_WAKE_ONE)) [[likely]]
return 0;
// ...try to wake someone...
int rc = futex_wake(&cond->value, 1);
int rc = futex_wake(&cond->value, 1, false);
VERIFY(rc >= 0);
// ...and if we have woken someone, put the flag back.
if (rc > 0)
@ -152,7 +152,7 @@ int pthread_cond_broadcast(pthread_cond_t* cond)
pthread_mutex_t* mutex = AK::atomic_load(&cond->mutex, AK::memory_order_relaxed);
VERIFY(mutex);
int rc = futex(&cond->value, FUTEX_REQUEUE, 1, nullptr, &mutex->lock, INT_MAX);
int rc = futex(&cond->value, FUTEX_REQUEUE | FUTEX_PRIVATE_FLAG, 1, nullptr, &mutex->lock, INT_MAX);
VERIFY(rc >= 0);
return 0;
}

View file

@ -155,7 +155,7 @@ int pthread_mutex_lock(pthread_mutex_t* mutex)
value = AK::atomic_exchange(&mutex->lock, MUTEX_LOCKED_NEED_TO_WAKE, AK::memory_order_acquire);
while (value != MUTEX_UNLOCKED) {
futex_wait(&mutex->lock, value, nullptr, 0);
futex_wait(&mutex->lock, value, nullptr, 0, false);
value = AK::atomic_exchange(&mutex->lock, MUTEX_LOCKED_NEED_TO_WAKE, AK::memory_order_acquire);
}
@ -172,7 +172,7 @@ int __pthread_mutex_lock_pessimistic_np(pthread_mutex_t* mutex)
// because we know we don't. Used in the condition variable implementation.
u32 value = AK::atomic_exchange(&mutex->lock, MUTEX_LOCKED_NEED_TO_WAKE, AK::memory_order_acquire);
while (value != MUTEX_UNLOCKED) {
futex_wait(&mutex->lock, value, nullptr, 0);
futex_wait(&mutex->lock, value, nullptr, 0, false);
value = AK::atomic_exchange(&mutex->lock, MUTEX_LOCKED_NEED_TO_WAKE, AK::memory_order_acquire);
}
@ -195,7 +195,7 @@ int pthread_mutex_unlock(pthread_mutex_t* mutex)
u32 value = AK::atomic_exchange(&mutex->lock, MUTEX_UNLOCKED, AK::memory_order_release);
if (value == MUTEX_LOCKED_NEED_TO_WAKE) [[unlikely]] {
int rc = futex_wake(&mutex->lock, 1);
int rc = futex_wake(&mutex->lock, 1, false);
VERIFY(rc >= 0);
}

View file

@ -46,7 +46,7 @@ int pthread_once(pthread_once_t* self, void (*callback)(void))
// anyone.
break;
case State::PERFORMING_WITH_WAITERS:
futex_wake(self, INT_MAX);
futex_wake(self, INT_MAX, false);
break;
}
@ -76,7 +76,7 @@ int pthread_once(pthread_once_t* self, void (*callback)(void))
[[fallthrough]];
case State::PERFORMING_WITH_WAITERS:
// Let's wait for it.
futex_wait(self, state2, nullptr, 0);
futex_wait(self, state2, nullptr, 0, false);
// We have been woken up, but that might have been due to a signal
// or something, so we have to reevaluate. We need acquire ordering
// here for the same reason as above. Hopefully we'll just see

View file

@ -83,7 +83,7 @@ int sem_post(sem_t* sem)
// Check if another sem_post() call has handled it already.
if (!(value & POST_WAKES)) [[likely]]
return 0;
int rc = futex_wake(&sem->value, 1);
int rc = futex_wake(&sem->value, 1, false);
VERIFY(rc >= 0);
return 0;
}
@ -145,7 +145,7 @@ int sem_timedwait(sem_t* sem, const struct timespec* abstime)
// Re-evaluate.
continue;
if (going_to_wake) [[unlikely]] {
int rc = futex_wake(&sem->value, count - 1);
int rc = futex_wake(&sem->value, count - 1, false);
VERIFY(rc >= 0);
}
return 0;
@ -162,7 +162,7 @@ int sem_timedwait(sem_t* sem, const struct timespec* abstime)
}
// At this point, we're committed to sleeping.
responsible_for_waking = true;
futex_wait(&sem->value, value, abstime, CLOCK_REALTIME);
futex_wait(&sem->value, value, abstime, CLOCK_REALTIME, false);
// This is the state we will probably see upon being waked:
value = 1;
}

View file

@ -27,7 +27,7 @@ int futex(uint32_t* userspace_address, int futex_op, uint32_t value, const struc
# define ALWAYS_INLINE_SERENITY_H
#endif
static ALWAYS_INLINE int futex_wait(uint32_t* userspace_address, uint32_t value, const struct timespec* abstime, int clockid)
static ALWAYS_INLINE int futex_wait(uint32_t* userspace_address, uint32_t value, const struct timespec* abstime, int clockid, int process_shared)
{
int op;
@ -39,12 +39,12 @@ static ALWAYS_INLINE int futex_wait(uint32_t* userspace_address, uint32_t value,
} else {
op = FUTEX_WAIT;
}
return futex(userspace_address, op, value, abstime, NULL, FUTEX_BITSET_MATCH_ANY);
return futex(userspace_address, op | (process_shared ? 0 : FUTEX_PRIVATE_FLAG), value, abstime, NULL, FUTEX_BITSET_MATCH_ANY);
}
static ALWAYS_INLINE int futex_wake(uint32_t* userspace_address, uint32_t count)
static ALWAYS_INLINE int futex_wake(uint32_t* userspace_address, uint32_t count, int process_shared)
{
return futex(userspace_address, FUTEX_WAKE, count, NULL, NULL, 0);
return futex(userspace_address, FUTEX_WAKE | (process_shared ? 0 : FUTEX_PRIVATE_FLAG), count, NULL, NULL, 0);
}
#ifdef ALWAYS_INLINE_SERENITY_H