From 9db10887a1499db41c1c549cf60a9c0ba27766c0 Mon Sep 17 00:00:00 2001 From: Idan Horowitz Date: Thu, 14 Jul 2022 01:25:35 +0300 Subject: [PATCH] Kernel: Clean up sys$futex and add support for cross-process futexes --- Kernel/API/POSIX/futex.h | 3 +- Kernel/Process.h | 30 +++- Kernel/Syscalls/futex.cpp | 165 +++++++++++++----- Userland/Libraries/LibC/pthread.cpp | 6 +- Userland/Libraries/LibC/pthread_cond.cpp | 6 +- .../Libraries/LibC/pthread_integration.cpp | 6 +- Userland/Libraries/LibC/pthread_once.cpp | 4 +- Userland/Libraries/LibC/semaphore.cpp | 6 +- Userland/Libraries/LibC/serenity.h | 8 +- 9 files changed, 164 insertions(+), 70 deletions(-) diff --git a/Kernel/API/POSIX/futex.h b/Kernel/API/POSIX/futex.h index d814e9910e9..fd90aafee95 100644 --- a/Kernel/API/POSIX/futex.h +++ b/Kernel/API/POSIX/futex.h @@ -53,7 +53,8 @@ extern "C" { #define FUTEX_WAKE_BITSET 10 #define FUTEX_CLOCK_REALTIME (1 << 8) -#define FUTEX_CMD_MASK ~(FUTEX_CLOCK_REALTIME) +#define FUTEX_PRIVATE_FLAG (1 << 9) +#define FUTEX_CMD_MASK ~(FUTEX_CLOCK_REALTIME | FUTEX_PRIVATE_FLAG) #define FUTEX_BITSET_MATCH_ANY 0xffffffff diff --git a/Kernel/Process.h b/Kernel/Process.h index 70d5650a924..e86c052edca 100644 --- a/Kernel/Process.h +++ b/Kernel/Process.h @@ -82,7 +82,22 @@ enum class VeilState { Locked, }; -using FutexQueues = HashMap>; +static constexpr FlatPtr futex_key_private_flag = 0b1; +union GlobalFutexKey { + struct { + Memory::VMObject const* vmobject; + FlatPtr offset; + } shared; + struct { + Memory::AddressSpace const* address_space; + FlatPtr user_address; + } private_; + struct { + FlatPtr parent; + FlatPtr offset; + } raw; +}; +static_assert(sizeof(GlobalFutexKey) == (sizeof(FlatPtr) * 2)); struct LoadResult; @@ -575,6 +590,8 @@ private: void clear_signal_handlers_for_exec(); void clear_futex_queues_on_exec(); + ErrorOr get_futex_key(FlatPtr user_address, bool shared); + ErrorOr remap_range_as_stack(FlatPtr address, size_t size); ErrorOr read_impl(int fd, Userspace buffer, size_t size); @@ -829,9 +846,6 @@ private: OwnPtr m_perf_event_buffer; - FutexQueues m_futex_queues; - Spinlock m_futex_lock; - // This member is used in the implementation of ptrace's PT_TRACEME flag. // If it is set to true, the process will stop at the next execve syscall // and wait for a tracer to attach. @@ -1040,3 +1054,11 @@ struct AK::Formatter : AK::Formatter { return AK::Formatter::format(builder, "{}({})"sv, value.name(), value.pid().value()); } }; + +namespace AK { +template<> +struct Traits : public GenericTraits { + static unsigned hash(Kernel::GlobalFutexKey const& futex_key) { return pair_int_hash(ptr_hash(futex_key.raw.parent), ptr_hash(futex_key.raw.offset)); } + static bool equals(Kernel::GlobalFutexKey const& a, Kernel::GlobalFutexKey const& b) { return a.raw.parent == b.raw.parent && a.raw.offset == b.raw.offset; } +}; +}; diff --git a/Kernel/Syscalls/futex.cpp b/Kernel/Syscalls/futex.cpp index b57e93dc15c..99b0256a86f 100644 --- a/Kernel/Syscalls/futex.cpp +++ b/Kernel/Syscalls/futex.cpp @@ -1,25 +1,89 @@ /* * Copyright (c) 2018-2021, Andreas Kling + * Copyright (c) 2022, Idan Horowitz * * SPDX-License-Identifier: BSD-2-Clause */ #include #include +#include #include #include namespace Kernel { +static Singleton>>> s_global_futex_queues; + void Process::clear_futex_queues_on_exec() { - SpinlockLocker lock(m_futex_lock); - for (auto& it : m_futex_queues) { - bool did_wake_all; - it.value->wake_all(did_wake_all); - VERIFY(did_wake_all); // No one should be left behind... + s_global_futex_queues->with([this](auto& queues) { + auto const* address_space = &this->address_space(); + queues.remove_all_matching([address_space](auto& futex_key, auto& futex_queue) { + if ((futex_key.raw.offset & futex_key_private_flag) == 0) + return false; + if (futex_key.private_.address_space != address_space) + return false; + bool did_wake_all; + futex_queue->wake_all(did_wake_all); + VERIFY(did_wake_all); // No one should be left behind... + return true; + }); + }); +} + +ErrorOr Process::get_futex_key(FlatPtr user_address, bool shared) +{ + if (user_address & 0b11) // user_address points to a u32, so must be 4byte aligned + return EINVAL; + + auto range = Memory::VirtualRange { VirtualAddress(user_address), sizeof(u32) }; + + if (!Kernel::Memory::is_user_range(range)) + return EFAULT; + + if (!shared) { // If this is thread-shared, we can skip searching the matching region + return GlobalFutexKey { + .private_ = { + .address_space = &address_space(), + .user_address = user_address | futex_key_private_flag, + } + }; } - m_futex_queues.clear(); + + auto* matching_region = address_space().find_region_containing(range); + if (!matching_region) + return EFAULT; + + // The user wants to share this futex, but if the address doesn't point to a shared resource, there's not + // much sharing to be done, so let's mark this as private + if (!matching_region->is_shared()) { + return GlobalFutexKey { + .private_ = { + .address_space = &address_space(), + .user_address = user_address | futex_key_private_flag, + } + }; + } + + // This address is backed by a shared VMObject, if it's an AnonymousVMObject, it can be shared between processes + // via forking, and shared regions that are cloned during a fork retain their original AnonymousVMObject. + // On the other hand, if it's a SharedInodeVMObject, it can be shared by two processes mapping the same file as + // MAP_SHARED, but since they are deduplicated based on the inode, in all cases the VMObject pointer should be + // a unique global identifier. + // NOTE: This assumes that a program will not unmap the only region keeping the vmobject alive while waiting on it, + // if it does, it will get stuck waiting forever until interrupted by a signal, but since that use case is defined as + // a programmer error, we are fine with it. + + auto const& vmobject = matching_region->vmobject(); + if (vmobject.is_inode()) + VERIFY(vmobject.is_shared_inode()); + + return GlobalFutexKey { + .shared = { + .vmobject = &vmobject, + .offset = matching_region->offset_in_vmobject_from_vaddr(range.base()) } + }; } ErrorOr Process::sys$futex(Userspace user_params) @@ -35,6 +99,8 @@ ErrorOr Process::sys$futex(Userspace u return ENOSYS; } + bool shared = (params.futex_op & FUTEX_PRIVATE_FLAG) == 0; + switch (cmd) { case FUTEX_WAIT: case FUTEX_WAIT_BITSET: @@ -56,41 +122,44 @@ ErrorOr Process::sys$futex(Userspace u } } - auto find_futex_queue = [&](FlatPtr user_address, bool create_if_not_found, bool* did_create = nullptr) -> ErrorOr> { + auto find_futex_queue = [&](GlobalFutexKey futex_key, bool create_if_not_found, bool* did_create = nullptr) -> ErrorOr> { VERIFY(!create_if_not_found || did_create != nullptr); - auto it = m_futex_queues.find(user_address); - if (it != m_futex_queues.end()) - return it->value; - if (create_if_not_found) { + return s_global_futex_queues->with([&](auto& queues) -> ErrorOr> { + auto it = queues.find(futex_key); + if (it != queues.end()) + return it->value; + if (!create_if_not_found) + return nullptr; *did_create = true; auto futex_queue = TRY(adopt_nonnull_ref_or_enomem(new (nothrow) FutexQueue)); - auto result = TRY(m_futex_queues.try_set(user_address, futex_queue)); + auto result = TRY(queues.try_set(futex_key, futex_queue)); VERIFY(result == AK::HashSetResult::InsertedNewEntry); return futex_queue; - } - return nullptr; + }); }; - auto remove_futex_queue = [&](FlatPtr user_address) { - if (auto it = m_futex_queues.find(user_address); it != m_futex_queues.end()) { - if (it->value->try_remove()) { - m_futex_queues.remove(it); - } - } + auto remove_futex_queue = [&](GlobalFutexKey futex_key) { + return s_global_futex_queues->with([&](auto& queues) { + auto it = queues.find(futex_key); + if (it == queues.end()) + return; + if (it->value->try_remove()) + queues.remove(it); + }); }; - auto do_wake = [&](FlatPtr user_address, u32 count, Optional bitmask) -> ErrorOr { + auto do_wake = [&](FlatPtr user_address, u32 count, Optional const& bitmask) -> ErrorOr { if (count == 0) return 0; - SpinlockLocker locker(m_futex_lock); - auto futex_queue = TRY(find_futex_queue(user_address, false)); + auto futex_key = TRY(get_futex_key(user_address, shared)); + auto futex_queue = TRY(find_futex_queue(futex_key, false)); if (!futex_queue) return 0; bool is_empty; u32 woke_count = futex_queue->wake_n(count, bitmask, is_empty); if (is_empty) { // If there are no more waiters, we want to get rid of the futex! - remove_futex_queue(user_address); + remove_futex_queue(futex_key); } return (int)woke_count; }; @@ -101,6 +170,7 @@ ErrorOr Process::sys$futex(Userspace u auto do_wait = [&](u32 bitset) -> ErrorOr { bool did_create; RefPtr futex_queue; + auto futex_key = TRY(get_futex_key(user_address, shared)); do { auto user_value = user_atomic_load_relaxed(params.userspace_address); if (!user_value.has_value()) @@ -111,9 +181,8 @@ ErrorOr Process::sys$futex(Userspace u } atomic_thread_fence(AK::MemoryOrder::memory_order_acquire); - SpinlockLocker locker(m_futex_lock); did_create = false; - futex_queue = TRY(find_futex_queue(user_address, true, &did_create)); + futex_queue = TRY(find_futex_queue(futex_key, true, &did_create)); VERIFY(futex_queue); // We need to try again if we didn't create this queue and the existing queue // was removed before we were able to queue an imminent wait. @@ -124,10 +193,9 @@ ErrorOr Process::sys$futex(Userspace u Thread::BlockResult block_result = futex_queue->wait_on(timeout, bitset); - SpinlockLocker locker(m_futex_lock); if (futex_queue->is_empty_and_no_imminent_waits()) { // If there are no more waiters, we want to get rid of the futex! - remove_futex_queue(user_address); + remove_futex_queue(futex_key); } if (block_result == Thread::BlockResult::InterruptedByTimeout) { return ETIMEDOUT; @@ -143,25 +211,28 @@ ErrorOr Process::sys$futex(Userspace u return EAGAIN; atomic_thread_fence(AK::MemoryOrder::memory_order_acquire); - int woken_or_requeued = 0; - SpinlockLocker locker(m_futex_lock); - if (auto futex_queue = TRY(find_futex_queue(user_address, false))) { - RefPtr target_futex_queue; - bool is_empty, is_target_empty; - woken_or_requeued = TRY(futex_queue->wake_n_requeue( - params.val, [&]() -> ErrorOr { - // NOTE: futex_queue's lock is being held while this callback is called - // The reason we're doing this in a callback is that we don't want to always - // create a target queue, only if we actually have anything to move to it! - target_futex_queue = TRY(find_futex_queue(user_address2, true)); - return target_futex_queue.ptr(); - }, - params.val2, is_empty, is_target_empty)); - if (is_empty) - remove_futex_queue(user_address); - if (is_target_empty && target_futex_queue) - remove_futex_queue(user_address2); - } + auto futex_key = TRY(get_futex_key(user_address, shared)); + auto futex_queue = TRY(find_futex_queue(futex_key, false)); + if (!futex_queue) + return 0; + + RefPtr target_futex_queue; + bool is_empty = false; + bool is_target_empty = false; + auto futex_key2 = TRY(get_futex_key(user_address2, shared)); + auto woken_or_requeued = TRY(futex_queue->wake_n_requeue( + params.val, [&]() -> ErrorOr { + // NOTE: futex_queue's lock is being held while this callback is called + // The reason we're doing this in a callback is that we don't want to always + // create a target queue, only if we actually have anything to move to it! + target_futex_queue = TRY(find_futex_queue(futex_key2, true)); + return target_futex_queue.ptr(); + }, + params.val2, is_empty, is_target_empty)); + if (is_empty) + remove_futex_queue(futex_key); + if (is_target_empty && target_futex_queue) + remove_futex_queue(futex_key2); return woken_or_requeued; }; diff --git a/Userland/Libraries/LibC/pthread.cpp b/Userland/Libraries/LibC/pthread.cpp index 7dd5771ad83..bab67312d09 100644 --- a/Userland/Libraries/LibC/pthread.cpp +++ b/Userland/Libraries/LibC/pthread.cpp @@ -655,7 +655,7 @@ static int rwlock_rdlock_maybe_timed(u32* lockp, const struct timespec* timeout // Seems like someone is writing (or is interested in writing and we let them have the lock) // wait until they're done. - auto rc = futex(lockp, FUTEX_WAIT_BITSET, current, timeout, nullptr, reader_wake_mask); + auto rc = futex(lockp, FUTEX_WAIT_BITSET | FUTEX_PRIVATE_FLAG, current, timeout, nullptr, reader_wake_mask); if (rc < 0 && errno == ETIMEDOUT && timeout) { return value_if_timeout; } @@ -703,7 +703,7 @@ static int rwlock_wrlock_maybe_timed(pthread_rwlock_t* lockval_p, const struct t // Seems like someone is writing (or is interested in writing and we let them have the lock) // wait until they're done. - auto rc = futex(lockp, FUTEX_WAIT_BITSET, current, timeout, nullptr, writer_wake_mask); + auto rc = futex(lockp, FUTEX_WAIT_BITSET | FUTEX_PRIVATE_FLAG, current, timeout, nullptr, writer_wake_mask); if (rc < 0 && errno == ETIMEDOUT && timeout) { return value_if_timeout; } @@ -794,7 +794,7 @@ int pthread_rwlock_unlock(pthread_rwlock_t* lockval_p) auto desired = current & ~(writer_locked_mask | writer_intent_mask); AK::atomic_store(lockp, desired, AK::MemoryOrder::memory_order_release); // Then wake both readers and writers, if any. - auto rc = futex(lockp, FUTEX_WAKE_BITSET, current, nullptr, nullptr, (current & writer_wake_mask) | reader_wake_mask); + auto rc = futex(lockp, FUTEX_WAKE_BITSET | FUTEX_PRIVATE_FLAG, current, nullptr, nullptr, (current & writer_wake_mask) | reader_wake_mask); if (rc < 0) return errno; return 0; diff --git a/Userland/Libraries/LibC/pthread_cond.cpp b/Userland/Libraries/LibC/pthread_cond.cpp index 5ba15c368e9..0ff377bfac5 100644 --- a/Userland/Libraries/LibC/pthread_cond.cpp +++ b/Userland/Libraries/LibC/pthread_cond.cpp @@ -98,7 +98,7 @@ int pthread_cond_timedwait(pthread_cond_t* cond, pthread_mutex_t* mutex, const s // value might change as soon as we unlock it. u32 value = AK::atomic_fetch_or(&cond->value, NEED_TO_WAKE_ONE | NEED_TO_WAKE_ALL, AK::memory_order_release) | NEED_TO_WAKE_ONE | NEED_TO_WAKE_ALL; pthread_mutex_unlock(mutex); - int rc = futex_wait(&cond->value, value, abstime, cond->clockid); + int rc = futex_wait(&cond->value, value, abstime, cond->clockid, false); if (rc < 0 && errno != EAGAIN) return errno; @@ -129,7 +129,7 @@ int pthread_cond_signal(pthread_cond_t* cond) if (!(value & NEED_TO_WAKE_ONE)) [[likely]] return 0; // ...try to wake someone... - int rc = futex_wake(&cond->value, 1); + int rc = futex_wake(&cond->value, 1, false); VERIFY(rc >= 0); // ...and if we have woken someone, put the flag back. if (rc > 0) @@ -152,7 +152,7 @@ int pthread_cond_broadcast(pthread_cond_t* cond) pthread_mutex_t* mutex = AK::atomic_load(&cond->mutex, AK::memory_order_relaxed); VERIFY(mutex); - int rc = futex(&cond->value, FUTEX_REQUEUE, 1, nullptr, &mutex->lock, INT_MAX); + int rc = futex(&cond->value, FUTEX_REQUEUE | FUTEX_PRIVATE_FLAG, 1, nullptr, &mutex->lock, INT_MAX); VERIFY(rc >= 0); return 0; } diff --git a/Userland/Libraries/LibC/pthread_integration.cpp b/Userland/Libraries/LibC/pthread_integration.cpp index 6434339c3cf..94eff866f0d 100644 --- a/Userland/Libraries/LibC/pthread_integration.cpp +++ b/Userland/Libraries/LibC/pthread_integration.cpp @@ -155,7 +155,7 @@ int pthread_mutex_lock(pthread_mutex_t* mutex) value = AK::atomic_exchange(&mutex->lock, MUTEX_LOCKED_NEED_TO_WAKE, AK::memory_order_acquire); while (value != MUTEX_UNLOCKED) { - futex_wait(&mutex->lock, value, nullptr, 0); + futex_wait(&mutex->lock, value, nullptr, 0, false); value = AK::atomic_exchange(&mutex->lock, MUTEX_LOCKED_NEED_TO_WAKE, AK::memory_order_acquire); } @@ -172,7 +172,7 @@ int __pthread_mutex_lock_pessimistic_np(pthread_mutex_t* mutex) // because we know we don't. Used in the condition variable implementation. u32 value = AK::atomic_exchange(&mutex->lock, MUTEX_LOCKED_NEED_TO_WAKE, AK::memory_order_acquire); while (value != MUTEX_UNLOCKED) { - futex_wait(&mutex->lock, value, nullptr, 0); + futex_wait(&mutex->lock, value, nullptr, 0, false); value = AK::atomic_exchange(&mutex->lock, MUTEX_LOCKED_NEED_TO_WAKE, AK::memory_order_acquire); } @@ -195,7 +195,7 @@ int pthread_mutex_unlock(pthread_mutex_t* mutex) u32 value = AK::atomic_exchange(&mutex->lock, MUTEX_UNLOCKED, AK::memory_order_release); if (value == MUTEX_LOCKED_NEED_TO_WAKE) [[unlikely]] { - int rc = futex_wake(&mutex->lock, 1); + int rc = futex_wake(&mutex->lock, 1, false); VERIFY(rc >= 0); } diff --git a/Userland/Libraries/LibC/pthread_once.cpp b/Userland/Libraries/LibC/pthread_once.cpp index ab435d46398..f13f4eaa7a1 100644 --- a/Userland/Libraries/LibC/pthread_once.cpp +++ b/Userland/Libraries/LibC/pthread_once.cpp @@ -46,7 +46,7 @@ int pthread_once(pthread_once_t* self, void (*callback)(void)) // anyone. break; case State::PERFORMING_WITH_WAITERS: - futex_wake(self, INT_MAX); + futex_wake(self, INT_MAX, false); break; } @@ -76,7 +76,7 @@ int pthread_once(pthread_once_t* self, void (*callback)(void)) [[fallthrough]]; case State::PERFORMING_WITH_WAITERS: // Let's wait for it. - futex_wait(self, state2, nullptr, 0); + futex_wait(self, state2, nullptr, 0, false); // We have been woken up, but that might have been due to a signal // or something, so we have to reevaluate. We need acquire ordering // here for the same reason as above. Hopefully we'll just see diff --git a/Userland/Libraries/LibC/semaphore.cpp b/Userland/Libraries/LibC/semaphore.cpp index d60d43fc1e8..a3546459cd5 100644 --- a/Userland/Libraries/LibC/semaphore.cpp +++ b/Userland/Libraries/LibC/semaphore.cpp @@ -83,7 +83,7 @@ int sem_post(sem_t* sem) // Check if another sem_post() call has handled it already. if (!(value & POST_WAKES)) [[likely]] return 0; - int rc = futex_wake(&sem->value, 1); + int rc = futex_wake(&sem->value, 1, false); VERIFY(rc >= 0); return 0; } @@ -145,7 +145,7 @@ int sem_timedwait(sem_t* sem, const struct timespec* abstime) // Re-evaluate. continue; if (going_to_wake) [[unlikely]] { - int rc = futex_wake(&sem->value, count - 1); + int rc = futex_wake(&sem->value, count - 1, false); VERIFY(rc >= 0); } return 0; @@ -162,7 +162,7 @@ int sem_timedwait(sem_t* sem, const struct timespec* abstime) } // At this point, we're committed to sleeping. responsible_for_waking = true; - futex_wait(&sem->value, value, abstime, CLOCK_REALTIME); + futex_wait(&sem->value, value, abstime, CLOCK_REALTIME, false); // This is the state we will probably see upon being waked: value = 1; } diff --git a/Userland/Libraries/LibC/serenity.h b/Userland/Libraries/LibC/serenity.h index 6ab5538743b..35cb299e677 100644 --- a/Userland/Libraries/LibC/serenity.h +++ b/Userland/Libraries/LibC/serenity.h @@ -27,7 +27,7 @@ int futex(uint32_t* userspace_address, int futex_op, uint32_t value, const struc # define ALWAYS_INLINE_SERENITY_H #endif -static ALWAYS_INLINE int futex_wait(uint32_t* userspace_address, uint32_t value, const struct timespec* abstime, int clockid) +static ALWAYS_INLINE int futex_wait(uint32_t* userspace_address, uint32_t value, const struct timespec* abstime, int clockid, int process_shared) { int op; @@ -39,12 +39,12 @@ static ALWAYS_INLINE int futex_wait(uint32_t* userspace_address, uint32_t value, } else { op = FUTEX_WAIT; } - return futex(userspace_address, op, value, abstime, NULL, FUTEX_BITSET_MATCH_ANY); + return futex(userspace_address, op | (process_shared ? 0 : FUTEX_PRIVATE_FLAG), value, abstime, NULL, FUTEX_BITSET_MATCH_ANY); } -static ALWAYS_INLINE int futex_wake(uint32_t* userspace_address, uint32_t count) +static ALWAYS_INLINE int futex_wake(uint32_t* userspace_address, uint32_t count, int process_shared) { - return futex(userspace_address, FUTEX_WAKE, count, NULL, NULL, 0); + return futex(userspace_address, FUTEX_WAKE | (process_shared ? 0 : FUTEX_PRIVATE_FLAG), count, NULL, NULL, 0); } #ifdef ALWAYS_INLINE_SERENITY_H