Mirror of https://gitlab.nic.cz/labs/bird.git (synced 2024-12-22 17:51:53 +00:00)

Generalized the rte_src lockfree usecount algorithm

commit 1df615991c
parent 0bcbff42ca
lib/bitmap.h
@@ -10,6 +10,8 @@
 #ifndef _BIRD_BITMAP_H_
 #define _BIRD_BITMAP_H_
 
+#include "lib/resource.h"
+
 struct bmap
 {
   u32 size;
lib/lockfree.h (new file, 146 lines)
@@ -0,0 +1,146 @@
+/*
+ *	BIRD Library -- Generic lock-free structures
+ *
+ *	(c) 2023 Maria Matejka <mq@jmq.cz>
+ *	(c) 2023 CZ.NIC, z.s.p.o.
+ *
+ *	Can be freely distributed and used under the terms of the GNU GPL.
+ */
+
+#ifndef _BIRD_LOCKFREE_H_
+#define _BIRD_LOCKFREE_H_
+
+#include "lib/event.h"
+#include "lib/rcu.h"
+
+#include <stdatomic.h>
+
+/**
+ * Lock-free usecounts.
+ */
+
+struct lfuc {
+  _Atomic u64 uc;
+};
+
+#define LFUC_PU_SHIFT      44
+#define LFUC_IN_PROGRESS   (1ULL << LFUC_PU_SHIFT)
+
+/**
+ * lfuc_lock - increase an atomic usecount
+ * @c: the usecount structure
+ */
+static inline void lfuc_lock(struct lfuc *c)
+{
+  /* Locking is trivial; somebody already holds the underlying data structure
+   * so we just increase the use count. Nothing can be freed underneath our hands. */
+  u64 uc = atomic_fetch_add_explicit(&c->uc, 1, memory_order_acq_rel);
+  ASSERT_DIE(uc > 0);
+}
+
+/**
+ * lfuc_lock_revive - increase an atomic usecount even if it's zero
+ * @c: the usecount structure
+ *
+ * If the caller is sure that they can't collide with the prune routine,
+ * they can call this even on structures with an already zeroed usecount.
+ * Handy for situations with flapping routes. Use only from the same
+ * loop as the one that runs the prune routine.
+ */
+static inline void lfuc_lock_revive(struct lfuc *c)
+{
+  UNUSED u64 uc = atomic_fetch_add_explicit(&c->uc, 1, memory_order_acq_rel);
+}
+
+/**
+ * lfuc_unlock - decrease an atomic usecount
+ * @c: the usecount structure
+ * @el: prune event list
+ * @ev: prune event itself
+ *
+ * If the usecount reaches zero, a prune event is run to possibly free the object.
+ * The prune event MUST use lfuc_finished() to check the object state.
+ */
+static inline void lfuc_unlock(struct lfuc *c, event_list *el, event *ev)
+{
+  /* Unlocking is tricky. We do it lockless so at the same time, the prune
+   * event may be running, therefore if the unlock gets us to zero, it must be
+   * the last thing in this routine, otherwise the prune routine may find the
+   * source's usecount zeroed, freeing it prematurely.
+   *
+   * The usecount is split into two parts:
+   * the top 20 bits are an in-progress indicator
+   * the bottom 44 bits keep the actual usecount.
+   *
+   * Therefore at most 1 million writers can simultaneously unlock the same
+   * structure, while at most ~17T different places can reference it. Both limits
+   * are insanely high from the 2022 point of view. Let's suppose that when 17T
+   * routes or 1M peers/tables get real, we also get 128-bit atomic variables in the
+   * C norm. */
+
+  /* First, we push the in-progress indicator */
+  u64 uc = atomic_fetch_add_explicit(&c->uc, LFUC_IN_PROGRESS, memory_order_acq_rel);
+
+  /* Then we split the indicator to its parts. Remember, we got the value
+   * before the operation happened so we're re-doing the operation locally
+   * to get a view of how the indicator _would_ look if nobody else was interacting. */
+  u64 pending = (uc >> LFUC_PU_SHIFT) + 1;
+  uc &= LFUC_IN_PROGRESS - 1;
+
+  /* We reuse the RCU critical section indicator to make the prune event wait
+   * until we finish here in the rare case we get preempted. */
+  rcu_read_lock();
+
+  /* Obviously, there can't be more pending unlocks than the usecount itself */
+  if (uc == pending)
+    /* If we're the last unlocker (every owner is already unlocking), schedule
+     * the owner's prune event */
+    ev_send(el, ev);
+  else
+    ASSERT_DIE(uc > pending);
+
+  /* And now, finally, simultaneously pop the in-progress indicator and the
+   * usecount, possibly allowing the pruning routine to free this structure */
+  atomic_fetch_sub_explicit(&c->uc, LFUC_IN_PROGRESS + 1, memory_order_acq_rel);
+
+  /* ... and to reduce the load a bit, the pruning routine will better wait for
+   * RCU synchronization instead of a busy loop. */
+  rcu_read_unlock();
+}
+
+/**
+ * lfuc_finished - auxiliary routine for prune event
+ * @c: usecount structure
+ *
+ * This routine simply waits until all unlockers finish their job and leave
+ * the critical section of lfuc_unlock(). Then we decide whether the usecount
+ * is indeed zero or not, and therefore whether the structure is free to be freed.
+ */
+static inline _Bool
+lfuc_finished(struct lfuc *c)
+{
+  u64 uc;
+  /* Wait until all unlockers finish */
+  while ((uc = atomic_load_explicit(&c->uc, memory_order_acquire)) >> LFUC_PU_SHIFT)
+    synchronize_rcu();
+
+  /* All of them are now done and if the usecount is now zero, then we're
+   * the last place to reference the object and we can call it finished. */
+  return (uc == 0);
+}
+
+/**
+ * lfuc_init - auxiliary routine for usecount initialization
+ * @c: usecount structure
+ *
+ * Called on object initialization, sets the usecount to an initial one to make
+ * sure that the prune routine doesn't free it before somebody else references it.
+ */
+static inline void
+lfuc_init(struct lfuc *c)
+{
+  atomic_store_explicit(&c->uc, 1, memory_order_release);
+}
+
+#endif
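Since the split-counter trick in lfuc_unlock() is the least obvious part of the new header, here is a small standalone walk-through of the arithmetic, simulating two holders unlocking the same structure. It is plain sequential C for illustration only (no atomics, no RCU) and is not part of the commit:

#include <stdint.h>
#include <stdio.h>

#define LFUC_PU_SHIFT    44
#define LFUC_IN_PROGRESS (1ULL << LFUC_PU_SHIFT)

int main(void)
{
  /* Two references exist; both holders start unlocking at the same time. */
  uint64_t uc = 2;

  /* Each unlocker first pushes the in-progress indicator and remembers the
   * value it saw, exactly what atomic_fetch_add_explicit() would return. */
  uint64_t seen_a = uc;  uc += LFUC_IN_PROGRESS;
  uint64_t seen_b = uc;  uc += LFUC_IN_PROGRESS;

  /* Each one then rebuilds its local view: how many unlocks are pending
   * (including its own) and what the plain usecount is. */
  uint64_t pending_a = (seen_a >> LFUC_PU_SHIFT) + 1;    /* 1 */
  uint64_t count_a   = seen_a & (LFUC_IN_PROGRESS - 1);  /* 2 -> 2 > 1, not last, no event */
  uint64_t pending_b = (seen_b >> LFUC_PU_SHIFT) + 1;    /* 2 */
  uint64_t count_b   = seen_b & (LFUC_IN_PROGRESS - 1);  /* 2 -> 2 == 2, last one, schedules prune */

  /* Finally both pop the indicator together with one use each. */
  uc -= LFUC_IN_PROGRESS + 1;
  uc -= LFUC_IN_PROGRESS + 1;

  printf("A: pending=%llu count=%llu   B: pending=%llu count=%llu   final uc=%llu\n",
         (unsigned long long) pending_a, (unsigned long long) count_a,
         (unsigned long long) pending_b, (unsigned long long) count_b,
         (unsigned long long) uc);
  return 0;
}

The unlocker that observes count == pending (here the second one) is the one that schedules the prune event; the other merely asserts count > pending and leaves, and the counter ends at zero only after every in-progress indicator has been popped.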
lib/route.h (49 lines changed)
@@ -16,6 +16,7 @@
 #include "lib/rcu.h"
 #include "lib/hash.h"
 #include "lib/event.h"
+#include "lib/lockfree.h"
 
 struct network;
 struct proto;
@@ -67,7 +68,7 @@ struct rte_src {
   struct rte_owner *owner;		/* Route source owner */
   u64 private_id;			/* Private ID, assigned by the protocol */
   u32 global_id;			/* Globally unique ID of the source */
-  _Atomic u64 uc;			/* Use count */
+  struct lfuc uc;			/* Use count */
 };
 
 struct rte_owner_class {
@@ -111,54 +112,12 @@ struct rte_src *rt_find_source_global(u32 id);
 
 static inline void rt_lock_source(struct rte_src *src)
 {
-  /* Locking a source is trivial; somebody already holds it so we just increase
-   * the use count. Nothing can be freed underneath our hands. */
-  u64 uc = atomic_fetch_add_explicit(&src->uc, 1, memory_order_acq_rel);
-  ASSERT_DIE(uc > 0);
+  lfuc_lock(&src->uc);
 }
 
 static inline void rt_unlock_source(struct rte_src *src)
 {
-  /* Unlocking is tricky. We do it lockless so at the same time, the prune
-   * event may be running, therefore if the unlock gets us to zero, it must be
-   * the last thing in this routine, otherwise the prune routine may find the
-   * source's usecount zeroed, freeing it prematurely.
-   *
-   * The usecount is split into two parts:
-   * the top 20 bits are an in-progress indicator
-   * the bottom 44 bits keep the actual usecount.
-   *
-   * Therefore at most 1 million of writers can simultaneously unlock the same
-   * source, while at most ~17T different routes can reference it. Both limits
-   * are insanely high from the 2022 point of view. Let's suppose that when 17T
-   * routes or 1M writers get real, we get also 128bit atomic variables in the
-   * C norm. */
-
-  /* First, we push the in-progress indicator */
-  u64 uc = atomic_fetch_add_explicit(&src->uc, RTE_SRC_IN_PROGRESS, memory_order_acq_rel);
-
-  /* Then we split the indicator to its parts. Remember, we got the value before the operation happened. */
-  u64 pending = (uc >> RTE_SRC_PU_SHIFT) + 1;
-  uc &= RTE_SRC_IN_PROGRESS - 1;
-
-  /* We per-use the RCU critical section indicator to make the prune event wait
-   * until we finish here in the rare case we get preempted. */
-  rcu_read_lock();
-
-  /* Obviously, there can't be more pending unlocks than the usecount itself */
-  if (uc == pending)
-    /* If we're the last unlocker, schedule the owner's prune event */
-    ev_send(src->owner->list, src->owner->prune);
-  else
-    ASSERT_DIE(uc > pending);
-
-  /* And now, finally, simultaneously pop the in-progress indicator and the
-   * usecount, possibly allowing the source pruning routine to free this structure */
-  atomic_fetch_sub_explicit(&src->uc, RTE_SRC_IN_PROGRESS + 1, memory_order_acq_rel);
-
-  /* ... and to reduce the load a bit, the source pruning routine will better wait for
-   * RCU synchronization instead of a busy loop. */
-  rcu_read_unlock();
+  lfuc_unlock(&src->uc, src->owner->list, src->owner->prune);
 }
 
 #ifdef RT_SOURCE_DEBUG
@@ -243,7 +243,7 @@ rt_get_source_o(struct rte_owner *p, u32 id)
 
   if (src)
   {
-    UNUSED u64 uc = atomic_fetch_add_explicit(&src->uc, 1, memory_order_acq_rel);
+    lfuc_lock_revive(&src->uc);
     return src;
   }
 
@@ -253,7 +253,7 @@ rt_get_source_o(struct rte_owner *p, u32 id)
   src->private_id = id;
   src->global_id = idm_alloc(&src_ids);
 
-  atomic_store_explicit(&src->uc, 1, memory_order_release);
+  lfuc_init(&src->uc);
   p->uc++;
 
   HASH_INSERT2(p->hash, RSH, rta_pool, src);
@@ -330,11 +330,7 @@ rt_prune_sources(void *data)
 
   HASH_WALK_FILTER(o->hash, next, src, sp)
   {
-    u64 uc;
-    while ((uc = atomic_load_explicit(&src->uc, memory_order_acquire)) >> RTE_SRC_PU_SHIFT)
-      synchronize_rcu();
-
-    if (uc == 0)
+    if (lfuc_finished(&src->uc))
     {
       o->uc--;
 
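The rte_src changes above are just the original algorithm re-expressed through the new helpers, and the same pattern should carry over to any other structure owned by a loop: embed a struct lfuc, initialize it with one reference, lock and unlock it from wherever, and let the owner's prune event check lfuc_finished(). A rough sketch of that wiring follows; all the my_* names are made up for illustration and the event setup is only assumed, nothing here is part of this commit or of the BIRD tree:

/* Hypothetical sketch of the generic lfuc usage pattern. */
#include "lib/lockfree.h"

struct my_obj {
  struct lfuc uc;               /* embedded lock-free usecount */
  /* ... payload ... */
};

struct my_owner {
  event_list *list;             /* loop where the prune event runs */
  event prune;                  /* assumed to be set up elsewhere with my_prune()
                                 * as its hook and the object as its data */
};

static inline void
my_obj_created(struct my_obj *o)
{
  lfuc_init(&o->uc);            /* start at one use so the prune event can't free it right away */
}

static inline void
my_obj_ref(struct my_obj *o)
{
  lfuc_lock(&o->uc);            /* caller already holds the object through something else */
}

static inline void
my_obj_unref(struct my_obj *o, struct my_owner *own)
{
  /* may schedule own->prune on own->list if this drops the last use */
  lfuc_unlock(&o->uc, own->list, &own->prune);
}

static void
my_prune(void *data)            /* runs as own->prune in the owner's loop */
{
  struct my_obj *o = data;
  if (lfuc_finished(&o->uc))    /* waits out in-flight unlockers, then checks for zero */
  {
    /* no references left; safe to tear the object down here */
  }
}

As with rte_src, the single reference installed by lfuc_init() bridges the gap between creating the object and handing out its first real reference.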