From b5061659d3cc011118024861c2f048e67affbd39 Mon Sep 17 00:00:00 2001 From: Maria Matejka Date: Thu, 4 Feb 2021 15:08:52 +0100 Subject: [PATCH 01/59] POSIX threads and thread-local storage is needed for concurrent execution --- aclocal.m4 | 14 ++++++++++++-- configure.ac | 46 +++++++++++++++++++--------------------------- lib/birdlib.h | 4 ---- lib/timer.c | 10 ---------- sysdep/unix/log.c | 11 ----------- 5 files changed, 31 insertions(+), 54 deletions(-) diff --git a/aclocal.m4 b/aclocal.m4 index 1613d680..3405b85b 100644 --- a/aclocal.m4 +++ b/aclocal.m4 @@ -1,5 +1,6 @@ dnl ** Additional Autoconf tests for BIRD configure script dnl ** (c) 1999 Martin Mares +dnl ** (c) 2021 Maria Matejka AC_DEFUN([BIRD_CHECK_THREAD_LOCAL], [ @@ -9,14 +10,23 @@ AC_DEFUN([BIRD_CHECK_THREAD_LOCAL], AC_COMPILE_IFELSE([ AC_LANG_PROGRAM( [ - _Thread_local static int x = 42; + static _Thread_local int x = 42; ], [] ) ], [bird_cv_thread_local=yes], + [AC_COMPILE_IFELSE([ + AC_LANG_PROGRAM( + [ + static __thread int x = 42; + ], + [] + ) + ], + [bird_cv_thread_local=__thread], [bird_cv_thread_local=no] - ) + )]) ) ]) diff --git a/configure.ac b/configure.ac index 64181d29..5c0cf002 100644 --- a/configure.ac +++ b/configure.ac @@ -36,12 +36,6 @@ AC_ARG_ENABLE([memcheck], [enable_memcheck=yes] ) -AC_ARG_ENABLE([pthreads], - [AS_HELP_STRING([--enable-pthreads], [enable POSIX threads support @<:@try@:>@])], - [], - [enable_pthreads=try] -) - AC_ARG_ENABLE([libssh], [AS_HELP_STRING([--enable-libssh], [enable LibSSH support in RPKI @<:@try@:>@])], [], @@ -125,25 +119,19 @@ if test -z "$GCC" ; then fi BIRD_CHECK_THREAD_LOCAL -if test "$bird_cv_thread_local" = yes ; then - AC_DEFINE([HAVE_THREAD_LOCAL], [1], [Define to 1 if _Thread_local is available]) +if test "$bird_cv_thread_local" = no ; then + AC_MSG_ERROR([This program requires thread local storage.]) +elif test "$bird_cv_thread_local" != yes ; then + AC_DEFINE_UNQUOTED([_Thread_local], [$bird_cv_thread_local], [Legacy _Thread_local]) fi -if test "$enable_pthreads" != no ; then - BIRD_CHECK_PTHREADS +BIRD_CHECK_PTHREADS - if test "$bird_cv_lib_pthreads" = yes ; then - AC_DEFINE([USE_PTHREADS], [1], [Define to 1 if pthreads are enabled]) - CFLAGS="$CFLAGS -pthread" - LDFLAGS="$LDFLAGS -pthread" - proto_bfd=bfd - elif test "$enable_pthreads" = yes ; then - AC_MSG_ERROR([POSIX threads not available.]) - fi - - if test "$enable_pthreads" = try ; then - enable_pthreads="$bird_cv_lib_pthreads" - fi +if test "$bird_cv_lib_pthreads" = yes ; then + CFLAGS="$CFLAGS -pthread" + LDFLAGS="$LDFLAGS -pthread" +else + AC_MSG_ERROR([POSIX threads not available.]) fi # This is assumed to be necessary for proper BIRD build @@ -304,8 +292,7 @@ if test "$enable_mpls_kernel" != no ; then fi fi -all_protocols="$proto_bfd babel bgp mrt ospf perf pipe radv rip rpki static" - +all_protocols="bfd babel bgp mrt ospf perf pipe radv rip rpki static" all_protocols=`echo $all_protocols | sed 's/ /,/g'` if test "$with_protocols" = all ; then @@ -351,9 +338,15 @@ case $sysdesc in esac AC_CHECK_HEADERS_ONCE([alloca.h syslog.h]) -AC_CHECK_HEADER([sys/mman.h], [AC_DEFINE([HAVE_MMAP], [1], [Define to 1 if mmap() is available.])]) +AC_CHECK_HEADER([sys/mman.h], [AC_DEFINE([HAVE_MMAP], [1], [Define to 1 if mmap() is available.])], have_mman=no) +AC_CHECK_FUNC([aligned_alloc], [AC_DEFINE([HAVE_ALIGNED_ALLOC], [1], [Define to 1 if aligned_alloc() is available.])], have_aligned_alloc=no) AC_CHECK_MEMBERS([struct sockaddr.sa_len], [], [], [#include ]) +if test "$have_aligned_alloc" = "no" && test "$have_mman" = "no" ; then + AC_MSG_ERROR([No means of aligned alloc found. Need mmap() or aligned_alloc().]) +fi + + AC_C_BIGENDIAN( [AC_DEFINE([CPU_BIG_ENDIAN], [1], [Define to 1 if cpu is big endian])], [AC_DEFINE([CPU_LITTLE_ENDIAN], [1], [Define to 1 if cpu is little endian])], @@ -402,7 +395,7 @@ if test "$enable_debug" = yes ; then fi fi - if test "enable_debug_expensive" = yes ; then + if test "$enable_debug_expensive" = yes ; then AC_DEFINE([ENABLE_EXPENSIVE_CHECKS], [1], [Define to 1 if you want to run expensive consistency checks.]) fi fi @@ -467,7 +460,6 @@ AC_MSG_RESULT([ Object directory: $objdir]) AC_MSG_RESULT([ Iproute2 directory: $iproutedir]) AC_MSG_RESULT([ System configuration: $sysdesc]) AC_MSG_RESULT([ Debugging: $enable_debug]) -AC_MSG_RESULT([ POSIX threads: $enable_pthreads]) AC_MSG_RESULT([ Routing protocols: $protocols]) AC_MSG_RESULT([ LibSSH support in RPKI: $enable_libssh]) AC_MSG_RESULT([ Kernel MPLS support: $enable_mpls_kernel]) diff --git a/lib/birdlib.h b/lib/birdlib.h index 2e642d38..dc8bd00f 100644 --- a/lib/birdlib.h +++ b/lib/birdlib.h @@ -77,10 +77,6 @@ static inline int u64_cmp(u64 i1, u64 i2) #define STATIC_ASSERT(EXP) _Static_assert(EXP, #EXP) #define STATIC_ASSERT_MSG(EXP,MSG) _Static_assert(EXP, MSG) -#ifndef HAVE_THREAD_LOCAL -#define _Thread_local -#endif - /* Microsecond time */ typedef s64 btime; diff --git a/lib/timer.c b/lib/timer.c index 381163d0..f978a0f3 100644 --- a/lib/timer.c +++ b/lib/timer.c @@ -40,8 +40,6 @@ struct timeloop main_timeloop; -#ifdef USE_PTHREADS - #include /* Data accessed and modified from proto/bfd/io.c */ @@ -62,14 +60,6 @@ timeloop_init_current(void) void wakeup_kick_current(void); -#else - -/* Just use main timelooop */ -static inline struct timeloop * timeloop_current(void) { return &main_timeloop; } -static inline void timeloop_init_current(void) { } - -#endif - btime current_time(void) { diff --git a/sysdep/unix/log.c b/sysdep/unix/log.c index 4e9df069..a23903b7 100644 --- a/sysdep/unix/log.c +++ b/sysdep/unix/log.c @@ -36,8 +36,6 @@ static list *current_log_list; static char *current_syslog_name; /* NULL -> syslog closed */ -#ifdef USE_PTHREADS - #include static pthread_mutex_t log_mutex; @@ -48,15 +46,6 @@ static pthread_t main_thread; void main_thread_init(void) { main_thread = pthread_self(); } static int main_thread_self(void) { return pthread_equal(pthread_self(), main_thread); } -#else - -static inline void log_lock(void) { } -static inline void log_unlock(void) { } -void main_thread_init(void) { } -static int main_thread_self(void) { return 1; } - -#endif - #ifdef HAVE_SYSLOG_H #include From feb17ced234bad13ae64b52a3f86241f74517997 Mon Sep 17 00:00:00 2001 From: Maria Matejka Date: Fri, 18 Jun 2021 18:10:42 +0200 Subject: [PATCH 02/59] Dropping the POSIX thread-local variables in favor of much easier-to-use C11 thread-local variables --- lib/timer.c | 47 ++++++++++++++------------------------------ lib/timer.h | 1 + proto/bfd/io.c | 53 +++++++++++++------------------------------------- 3 files changed, 29 insertions(+), 72 deletions(-) diff --git a/lib/timer.c b/lib/timer.c index f978a0f3..6efcadb4 100644 --- a/lib/timer.c +++ b/lib/timer.c @@ -43,38 +43,23 @@ struct timeloop main_timeloop; #include /* Data accessed and modified from proto/bfd/io.c */ -pthread_key_t current_time_key; - -static inline struct timeloop * -timeloop_current(void) -{ - return pthread_getspecific(current_time_key); -} - -static inline void -timeloop_init_current(void) -{ - pthread_key_create(¤t_time_key, NULL); - pthread_setspecific(current_time_key, &main_timeloop); -} +_Thread_local struct timeloop *local_timeloop; void wakeup_kick_current(void); btime current_time(void) { - return timeloop_current()->last_time; + return local_timeloop->last_time; } btime current_real_time(void) { - struct timeloop *loop = timeloop_current(); + if (!local_timeloop->real_time) + times_update_real_time(local_timeloop); - if (!loop->real_time) - times_update_real_time(loop); - - return loop->real_time; + return local_timeloop->real_time; } @@ -128,30 +113,29 @@ tm_new(pool *p) void tm_set(timer *t, btime when) { - struct timeloop *loop = timeloop_current(); - uint tc = timers_count(loop); + uint tc = timers_count(local_timeloop); if (!t->expires) { t->index = ++tc; t->expires = when; - BUFFER_PUSH(loop->timers) = t; - HEAP_INSERT(loop->timers.data, tc, timer *, TIMER_LESS, TIMER_SWAP); + BUFFER_PUSH(local_timeloop->timers) = t; + HEAP_INSERT(local_timeloop->timers.data, tc, timer *, TIMER_LESS, TIMER_SWAP); } else if (t->expires < when) { t->expires = when; - HEAP_INCREASE(loop->timers.data, tc, timer *, TIMER_LESS, TIMER_SWAP, t->index); + HEAP_INCREASE(local_timeloop->timers.data, tc, timer *, TIMER_LESS, TIMER_SWAP, t->index); } else if (t->expires > when) { t->expires = when; - HEAP_DECREASE(loop->timers.data, tc, timer *, TIMER_LESS, TIMER_SWAP, t->index); + HEAP_DECREASE(local_timeloop->timers.data, tc, timer *, TIMER_LESS, TIMER_SWAP, t->index); } #ifdef CONFIG_BFD /* Hack to notify BFD loops */ - if ((loop != &main_timeloop) && (t->index == 1)) + if ((local_timeloop != &main_timeloop) && (t->index == 1)) wakeup_kick_current(); #endif } @@ -168,11 +152,10 @@ tm_stop(timer *t) if (!t->expires) return; - struct timeloop *loop = timeloop_current(); - uint tc = timers_count(loop); + uint tc = timers_count(local_timeloop); - HEAP_DELETE(loop->timers.data, tc, timer *, TIMER_LESS, TIMER_SWAP, t->index); - BUFFER_POP(loop->timers); + HEAP_DELETE(local_timeloop->timers.data, tc, timer *, TIMER_LESS, TIMER_SWAP, t->index); + BUFFER_POP(local_timeloop->timers); t->index = -1; t->expires = 0; @@ -230,7 +213,7 @@ void timer_init(void) { timers_init(&main_timeloop, &root_pool); - timeloop_init_current(); + local_timeloop = &main_timeloop; } diff --git a/lib/timer.h b/lib/timer.h index c5ea430c..bc568ee6 100644 --- a/lib/timer.h +++ b/lib/timer.h @@ -42,6 +42,7 @@ static inline timer *timers_first(struct timeloop *loop) { return (loop->timers.used > 1) ? loop->timers.data[1] : NULL; } extern struct timeloop main_timeloop; +extern _Thread_local struct timeloop *local_timeloop; btime current_time(void); btime current_real_time(void); diff --git a/proto/bfd/io.c b/proto/bfd/io.c index 1cd9365a..8fdc84fb 100644 --- a/proto/bfd/io.c +++ b/proto/bfd/io.c @@ -52,29 +52,15 @@ struct birdloop * Current thread context */ -static pthread_key_t current_loop_key; -extern pthread_key_t current_time_key; - -static inline struct birdloop * -birdloop_current(void) -{ - return pthread_getspecific(current_loop_key); -} +static _Thread_local struct birdloop *birdloop_current; static inline void birdloop_set_current(struct birdloop *loop) { - pthread_setspecific(current_loop_key, loop); - pthread_setspecific(current_time_key, loop ? &loop->time : &main_timeloop); + birdloop_current = loop; + local_timeloop = loop ? &loop->time : &main_timeloop; } -static inline void -birdloop_init_current(void) -{ - pthread_key_create(¤t_loop_key, NULL); -} - - /* * Wakeup code for birdloop */ @@ -162,10 +148,8 @@ wakeup_kick(struct birdloop *loop) void wakeup_kick_current(void) { - struct birdloop *loop = birdloop_current(); - - if (loop && loop->poll_active) - wakeup_kick(loop); + if (birdloop_current && birdloop_current->poll_active) + wakeup_kick(birdloop_current); } @@ -195,15 +179,13 @@ events_fire(struct birdloop *loop) void ev2_schedule(event *e) { - struct birdloop *loop = birdloop_current(); - - if (loop->poll_active && EMPTY_LIST(loop->event_list)) - wakeup_kick(loop); + if (birdloop_current->poll_active && EMPTY_LIST(birdloop_current->event_list)) + wakeup_kick(birdloop_current); if (e->n.next) rem_node(&e->n); - add_tail(&loop->event_list, &e->n); + add_tail(&birdloop_current->event_list, &e->n); } @@ -238,9 +220,7 @@ sockets_add(struct birdloop *loop, sock *s) void sk_start(sock *s) { - struct birdloop *loop = birdloop_current(); - - sockets_add(loop, s); + sockets_add(birdloop_current, s); } static void @@ -261,14 +241,12 @@ sockets_remove(struct birdloop *loop, sock *s) void sk_stop(sock *s) { - struct birdloop *loop = birdloop_current(); + sockets_remove(birdloop_current, s); - sockets_remove(loop, s); - - if (loop->poll_active) + if (birdloop_current->poll_active) { - loop->close_scheduled = 1; - wakeup_kick(loop); + birdloop_current->close_scheduled = 1; + wakeup_kick(birdloop_current); } else close(s->fd); @@ -392,11 +370,6 @@ static void * birdloop_main(void *arg); struct birdloop * birdloop_new(void) { - /* FIXME: this init should be elsewhere and thread-safe */ - static int init = 0; - if (!init) - { birdloop_init_current(); init = 1; } - pool *p = rp_new(NULL, "Birdloop root"); struct birdloop *loop = mb_allocz(p, sizeof(struct birdloop)); loop->pool = p; From 1db83a507a9ae287815d62733d1337074993b433 Mon Sep 17 00:00:00 2001 From: Maria Matejka Date: Thu, 4 Feb 2021 15:52:42 +0100 Subject: [PATCH 03/59] Locking subsystem: Just a global BIRD lock to begin with. --- lib/birdlib.h | 1 + lib/lists.c | 2 +- lib/locking.h | 46 ++++++++++++++++++ sysdep/unix/Makefile | 2 +- sysdep/unix/coroutine.c | 102 ++++++++++++++++++++++++++++++++++++++++ sysdep/unix/io.c | 3 ++ sysdep/unix/main.c | 3 ++ 7 files changed, 157 insertions(+), 2 deletions(-) create mode 100644 lib/locking.h create mode 100644 sysdep/unix/coroutine.c diff --git a/lib/birdlib.h b/lib/birdlib.h index dc8bd00f..3dc39d19 100644 --- a/lib/birdlib.h +++ b/lib/birdlib.h @@ -9,6 +9,7 @@ #ifndef _BIRD_BIRDLIB_H_ #define _BIRD_BIRDLIB_H_ +#include "sysdep/config.h" #include "lib/alloca.h" /* Ugly structure offset handling macros */ diff --git a/lib/lists.c b/lib/lists.c index 58d51073..dc2e4cbb 100644 --- a/lib/lists.c +++ b/lib/lists.c @@ -26,7 +26,7 @@ #define _BIRD_LISTS_C_ -#include "nest/bird.h" +#include "lib/birdlib.h" #include "lib/lists.h" LIST_INLINE int diff --git a/lib/locking.h b/lib/locking.h new file mode 100644 index 00000000..eb1bc8fa --- /dev/null +++ b/lib/locking.h @@ -0,0 +1,46 @@ +/* + * BIRD Library -- Locking + * + * (c) 2020--2021 Maria Matejka + * + * Can be freely distributed and used under the terms of the GNU GPL. + */ + +#ifndef _BIRD_LOCKING_H_ +#define _BIRD_LOCKING_H_ + +struct domain_generic; + +/* Here define the global lock order; first to last. */ +struct lock_order { + struct domain_generic *the_bird; +}; + +#define LOCK_ORDER_DEPTH (sizeof(struct lock_order) / sizeof(struct domain_generic *)) + +extern _Thread_local struct lock_order locking_stack; +extern _Thread_local struct domain_generic **last_locked; + +#define DOMAIN(type) struct domain__##type +#define DEFINE_DOMAIN(type) DOMAIN(type) { struct domain_generic *type; } + +#define DOMAIN_NEW(type, name) (DOMAIN(type)) { .type = domain_new(name) } +struct domain_generic *domain_new(const char *name); + +#define DOMAIN_NULL(type) (DOMAIN(type)) {} + +#define LOCK_DOMAIN(type, d) do_lock(((d).type), &(locking_stack.type)) +#define UNLOCK_DOMAIN(type, d) do_unlock(((d).type), &(locking_stack.type)) + +/* Internal for locking */ +void do_lock(struct domain_generic *dg, struct domain_generic **lsp); +void do_unlock(struct domain_generic *dg, struct domain_generic **lsp); + +/* Use with care. To be removed in near future. */ +DEFINE_DOMAIN(the_bird); +extern DOMAIN(the_bird) the_bird_domain; + +#define the_bird_lock() LOCK_DOMAIN(the_bird, the_bird_domain) +#define the_bird_unlock() UNLOCK_DOMAIN(the_bird, the_bird_domain) + +#endif diff --git a/sysdep/unix/Makefile b/sysdep/unix/Makefile index d0d36b5f..69cf8131 100644 --- a/sysdep/unix/Makefile +++ b/sysdep/unix/Makefile @@ -1,4 +1,4 @@ -src := alloc.c io.c krt.c log.c main.c random.c +src := alloc.c io.c krt.c log.c main.c random.c coroutine.c obj := $(src-o-files) $(all-daemon) $(cf-local) diff --git a/sysdep/unix/coroutine.c b/sysdep/unix/coroutine.c new file mode 100644 index 00000000..05f101fb --- /dev/null +++ b/sysdep/unix/coroutine.c @@ -0,0 +1,102 @@ +/* + * BIRD Coroutines + * + * (c) 2017 Martin Mares + * (c) 2020 Maria Matejka + * + * Can be freely distributed and used under the terms of the GNU GPL. + */ + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#undef LOCAL_DEBUG + +#undef DEBUG_LOCKING + +#include "lib/birdlib.h" +#include "lib/locking.h" +#include "lib/resource.h" + +/* + * Implementation of coroutines based on POSIX threads + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Locking subsystem + */ + +#define ASSERT_NO_LOCK ASSERT_DIE(last_locked == NULL) + +struct domain_generic { + pthread_mutex_t mutex; + struct domain_generic **prev; + struct lock_order *locked_by; + const char *name; +}; + +#define DOMAIN_INIT(_name) { .mutex = PTHREAD_MUTEX_INITIALIZER, .name = _name } + +static struct domain_generic the_bird_domain_gen = DOMAIN_INIT("The BIRD"); + +DOMAIN(the_bird) the_bird_domain = { .the_bird = &the_bird_domain_gen }; + +struct domain_generic * +domain_new(const char *name) +{ + struct domain_generic *dg = xmalloc(sizeof(struct domain_generic)); + *dg = (struct domain_generic) DOMAIN_INIT(name); + return dg; +} + +void +domain_free(struct domain_generic *dg) +{ + pthread_mutex_destroy(&dg->mutex); + xfree(dg); +} + +_Thread_local struct lock_order locking_stack = {}; +_Thread_local struct domain_generic **last_locked = NULL; + +void do_lock(struct domain_generic *dg, struct domain_generic **lsp) +{ + if (lsp <= last_locked) + bug("Trying to lock in a bad order"); + if (*lsp) + bug("Inconsistent locking stack state on lock"); + + pthread_mutex_lock(&dg->mutex); + + if (dg->prev || dg->locked_by) + bug("Previous unlock not finished correctly"); + dg->prev = last_locked; + *lsp = dg; + last_locked = lsp; + dg->locked_by = &locking_stack; +} + +void do_unlock(struct domain_generic *dg, struct domain_generic **lsp) +{ + if (dg->locked_by != &locking_stack) + bug("Inconsistent domain state on unlock"); + if ((last_locked != lsp) || (*lsp != dg)) + bug("Inconsistent locking stack state on unlock"); + dg->locked_by = NULL; + last_locked = dg->prev; + *lsp = NULL; + dg->prev = NULL; + pthread_mutex_unlock(&dg->mutex); +} + diff --git a/sysdep/unix/io.c b/sysdep/unix/io.c index 3d67d0a7..29467867 100644 --- a/sysdep/unix/io.c +++ b/sysdep/unix/io.c @@ -36,6 +36,7 @@ #include "lib/resource.h" #include "lib/socket.h" #include "lib/event.h" +#include "lib/locking.h" #include "lib/timer.h" #include "lib/string.h" #include "nest/iface.h" @@ -2263,7 +2264,9 @@ io_loop(void) /* And finally enter poll() to find active sockets */ watchdog_stop(); + the_bird_unlock(); pout = poll(pfd, nfds, poll_tout); + the_bird_lock(); watchdog_start(); if (pout < 0) diff --git a/sysdep/unix/main.c b/sysdep/unix/main.c index 7e8ea0dc..dabfc554 100644 --- a/sysdep/unix/main.c +++ b/sysdep/unix/main.c @@ -28,6 +28,7 @@ #include "lib/resource.h" #include "lib/socket.h" #include "lib/event.h" +#include "lib/locking.h" #include "lib/timer.h" #include "lib/string.h" #include "nest/route.h" @@ -959,6 +960,8 @@ main(int argc, char **argv) dup2(0, 2); } + the_bird_lock(); + main_thread_init(); write_pid_file(); From 2d7e42cc593b3c99f8eacaaa506194aeb3a5cf85 Mon Sep 17 00:00:00 2001 From: Maria Matejka Date: Wed, 13 Oct 2021 11:33:48 +0200 Subject: [PATCH 04/59] Type checking in SKIP_BACK macro --- lib/birdlib.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/birdlib.h b/lib/birdlib.h index 431b7c0d..2e642d38 100644 --- a/lib/birdlib.h +++ b/lib/birdlib.h @@ -16,7 +16,7 @@ struct align_probe { char x; long int y; }; #define OFFSETOF(s, i) ((size_t) &((s *)0)->i) -#define SKIP_BACK(s, i, p) ((s *)((char *)p - OFFSETOF(s, i))) +#define SKIP_BACK(s, i, p) ({ s *_ptr = ((s *)((char *)p - OFFSETOF(s, i))); ASSERT_DIE(&_ptr->i == p); _ptr; }) #define BIRD_ALIGN(s, a) (((s)+a-1)&~(a-1)) #define CPU_STRUCT_ALIGN (sizeof(struct align_probe)) From 1289c1c5eede5b3d015d06b725d30024ccac51bd Mon Sep 17 00:00:00 2001 From: Maria Matejka Date: Mon, 8 Feb 2021 09:51:59 +0100 Subject: [PATCH 05/59] Coroutines: A simple and lightweight parallel execution framework. --- lib/coro.h | 26 ++++++++++++++++ sysdep/unix/coroutine.c | 69 +++++++++++++++++++++++++++++++++++++++++ sysdep/unix/io.c | 27 +++++++++++++++- sysdep/unix/log.c | 34 ++++++++++++++++++-- sysdep/unix/unix.h | 1 + 5 files changed, 153 insertions(+), 4 deletions(-) create mode 100644 lib/coro.h diff --git a/lib/coro.h b/lib/coro.h new file mode 100644 index 00000000..51712b36 --- /dev/null +++ b/lib/coro.h @@ -0,0 +1,26 @@ +/* + * BIRD Coroutines + * + * (c) 2017 Martin Mares + * (c) 2020 Maria Matejka + * + * Can be freely distributed and used under the terms of the GNU GPL. + */ + +#ifndef _BIRD_CORO_H_ +#define _BIRD_CORO_H_ + +#include "lib/resource.h" + +/* A completely opaque coroutine handle. */ +struct coroutine; + +/* Coroutines are independent threads bound to pools. + * You request a coroutine by calling coro_run(). + * It is forbidden to free a running coroutine from outside. + * The running coroutine must free itself by rfree() before returning. + */ +struct coroutine *coro_run(pool *, void (*entry)(void *), void *data); + + +#endif diff --git a/sysdep/unix/coroutine.c b/sysdep/unix/coroutine.c index 05f101fb..71847505 100644 --- a/sysdep/unix/coroutine.c +++ b/sysdep/unix/coroutine.c @@ -17,7 +17,14 @@ #include "lib/birdlib.h" #include "lib/locking.h" +#include "lib/coro.h" #include "lib/resource.h" +#include "lib/timer.h" + +/* Using a rather big stack for coroutines to allow for stack-local allocations. + * In real world, the kernel doesn't alloc this memory until it is used. + * */ +#define CORO_STACK_SIZE 1048576 /* * Implementation of coroutines based on POSIX threads @@ -100,3 +107,65 @@ void do_unlock(struct domain_generic *dg, struct domain_generic **lsp) pthread_mutex_unlock(&dg->mutex); } +/* Coroutines */ +struct coroutine { + resource r; + pthread_t id; + pthread_attr_t attr; + void (*entry)(void *); + void *data; +}; + +static _Thread_local _Bool coro_cleaned_up = 0; + +static void coro_free(resource *r) +{ + struct coroutine *c = (void *) r; + ASSERT_DIE(pthread_equal(pthread_self(), c->id)); + pthread_attr_destroy(&c->attr); + coro_cleaned_up = 1; +} + +static struct resclass coro_class = { + .name = "Coroutine", + .size = sizeof(struct coroutine), + .free = coro_free, +}; + +static void *coro_entry(void *p) +{ + struct coroutine *c = p; + ASSERT_DIE(c->entry); + + c->entry(c->data); + ASSERT_DIE(coro_cleaned_up); + + return NULL; +} + +struct coroutine *coro_run(pool *p, void (*entry)(void *), void *data) +{ + ASSERT_DIE(entry); + ASSERT_DIE(p); + + struct coroutine *c = ralloc(p, &coro_class); + + c->entry = entry; + c->data = data; + + int e = 0; + + if (e = pthread_attr_init(&c->attr)) + die("pthread_attr_init() failed: %M", e); + + if (e = pthread_attr_setstacksize(&c->attr, CORO_STACK_SIZE)) + die("pthread_attr_setstacksize(%u) failed: %M", CORO_STACK_SIZE, e); + + if (e = pthread_attr_setdetachstate(&c->attr, PTHREAD_CREATE_DETACHED)) + die("pthread_attr_setdetachstate(PTHREAD_CREATE_DETACHED) failed: %M", e); + + if (e = pthread_create(&c->id, &c->attr, coro_entry, c)) + die("pthread_create() failed: %M", e); + + return c; +} diff --git a/sysdep/unix/io.c b/sysdep/unix/io.c index 29467867..40841ea4 100644 --- a/sysdep/unix/io.c +++ b/sysdep/unix/io.c @@ -2176,6 +2176,15 @@ static int short_loops = 0; #define SHORT_LOOP_MAX 10 #define WORK_EVENTS_MAX 10 +static int poll_reload_pipe[2]; + +void +io_loop_reload(void) +{ + char b; + write(poll_reload_pipe[1], &b, 1); +} + void io_loop(void) { @@ -2187,6 +2196,9 @@ io_loop(void) int fdmax = 256; struct pollfd *pfd = xmalloc(fdmax * sizeof(struct pollfd)); + if (pipe(poll_reload_pipe) < 0) + die("pipe(poll_reload_pipe) failed: %m"); + watchdog_start1(); for(;;) { @@ -2205,7 +2217,12 @@ io_loop(void) poll_tout = MIN(poll_tout, timeout); } - nfds = 0; + /* A hack to reload main io_loop() when something has changed asynchronously. */ + pfd[0].fd = poll_reload_pipe[0]; + pfd[0].events = POLLIN; + + nfds = 1; + WALK_LIST(n, sock_list) { pfd[nfds] = (struct pollfd) { .fd = -1 }; /* everything other set to 0 by this */ @@ -2277,6 +2294,14 @@ io_loop(void) } if (pout) { + if (pfd[0].revents & POLLIN) + { + /* IO loop reload requested */ + char b; + read(poll_reload_pipe[0], &b, 1); + continue; + } + times_update(&main_timeloop); /* guaranteed to be non-empty */ diff --git a/sysdep/unix/log.c b/sysdep/unix/log.c index a23903b7..dc2b14b3 100644 --- a/sysdep/unix/log.c +++ b/sysdep/unix/log.c @@ -15,6 +15,7 @@ * user's manual. */ +#include #include #include #include @@ -35,6 +36,10 @@ static FILE *dbgf; static list *current_log_list; static char *current_syslog_name; /* NULL -> syslog closed */ +static _Atomic uint max_coro_id = ATOMIC_VAR_INIT(1); +static _Thread_local uint this_coro_id; + +#define THIS_CORO_ID (this_coro_id ?: (this_coro_id = atomic_fetch_add_explicit(&max_coro_id, 1, memory_order_acq_rel))) #include @@ -178,7 +183,7 @@ log_commit(int class, buffer *buf) l->pos += msg_len; } - fprintf(l->fh, "%s <%s> ", tbuf, class_names[class]); + fprintf(l->fh, "%s [%04x] <%s> ", tbuf, THIS_CORO_ID, class_names[class]); } fputs(buf->start, l->fh); fputc('\n', l->fh); @@ -288,6 +293,8 @@ die(const char *msg, ...) exit(1); } +static struct timespec dbg_time_start; + /** * debug - write to debug output * @msg: a printf-like message @@ -300,12 +307,33 @@ debug(const char *msg, ...) { #define MAX_DEBUG_BUFSIZE 16384 va_list args; - char buf[MAX_DEBUG_BUFSIZE]; + char buf[MAX_DEBUG_BUFSIZE], *pos = buf; + int max = MAX_DEBUG_BUFSIZE; va_start(args, msg); if (dbgf) { - if (bvsnprintf(buf, MAX_DEBUG_BUFSIZE, msg, args) < 0) + struct timespec dbg_time; + clock_gettime(CLOCK_MONOTONIC, &dbg_time); + uint nsec; + uint sec; + + if (dbg_time.tv_nsec > dbg_time_start.tv_nsec) + { + nsec = dbg_time.tv_nsec - dbg_time_start.tv_nsec; + sec = dbg_time.tv_sec - dbg_time_start.tv_sec; + } + else + { + nsec = 1000000000 + dbg_time.tv_nsec - dbg_time_start.tv_nsec; + sec = dbg_time.tv_sec - dbg_time_start.tv_sec - 1; + } + + int n = bsnprintf(pos, max, "%u.%09u: [%04x] ", sec, nsec, THIS_CORO_ID); + pos += n; + max -= n; + + if (bvsnprintf(pos, max, msg, args) < 0) bug("Extremely long debug output, split it."); fputs(buf, dbgf); diff --git a/sysdep/unix/unix.h b/sysdep/unix/unix.h index ad85d1ea..313c97c3 100644 --- a/sysdep/unix/unix.h +++ b/sysdep/unix/unix.h @@ -106,6 +106,7 @@ extern volatile sig_atomic_t async_shutdown_flag; void io_init(void); void io_loop(void); +void io_loop_reload(void); void io_log_dump(void); int sk_open_unix(struct birdsock *s, char *name); struct rfile *rf_open(struct pool *, const char *name, const char *mode); From b5ca6a79d36549fcc5b934b8ffc283f15b8da4a5 Mon Sep 17 00:00:00 2001 From: Maria Matejka Date: Wed, 13 Oct 2021 14:50:02 +0200 Subject: [PATCH 06/59] GDB: SKIP_BACK and linked list tools --- bird-gdb.py | 114 +++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 113 insertions(+), 1 deletion(-) diff --git a/bird-gdb.py b/bird-gdb.py index 3cf65a9c..c7351d6b 100644 --- a/bird-gdb.py +++ b/bird-gdb.py @@ -123,7 +123,7 @@ class BIRDFLinePrinter(BIRDPrinter): "n": n, "code": str(self.val['items'][n]['fi_code']), } if n % 8 == 0 else str(self.val['items'][n]['fi_code']) for n in range(cnt)])) - + class BIRDFExecStackPrinter(BIRDPrinter): "Print BIRD's struct f_exec_stack" @@ -141,6 +141,118 @@ class BIRDFExecStackPrinter(BIRDPrinter): "n": n } for n in range(cnt-1, -1, -1) ]) + +class BIRD: + def skip_back(t, i, v): + if isinstance(t, str): + t = gdb.lookup_type(t) + elif isinstance(t, gdb.Value): + t = gdb.lookup_type(t.string()) + elif not isinstance(t, gdb.Type): + raise Exception(f"First argument of skip_back(t, i, v) must be a type, got {type(t)}") + + t = t.strip_typedefs() + nullptr = gdb.Value(0).cast(t.pointer()) + + if isinstance(i, gdb.Value): + i = i.string() + elif not isinstance(i, str): + raise Exception(f"Second argument of skip_back(t, i, v) must be a item name, got {type(i)}") + + if not isinstance(v, gdb.Value): + raise Exception(f"Third argument of skip_back(t, i, v) must be a value, got {type(v)}") + if v.type.code != gdb.TYPE_CODE_PTR and v.type.code != gdb.TYPE_CODE_REF: + raise Exception(f"Third argument of skip_back(t, i, v) must be a pointer, is {v.type} ({v.type.code})") + if v.type.target().strip_typedefs() != nullptr[i].type: + raise Exception(f"Third argument of skip_back(t, i, v) points to type {v.type.target().strip_typedefs()}, should be {nullptr[i].type}") + + uintptr_t = gdb.lookup_type("uintptr_t") + taddr = v.dereference().address.cast(uintptr_t) - nullptr[i].address.cast(uintptr_t) + return gdb.Value(taddr).cast(t.pointer()) + + class skip_back_gdb(gdb.Function): + "Given address of a structure item, returns address of the structure, as the SKIP_BACK macro does" + def __init__(self): + gdb.Function.__init__(self, "SKIP_BACK") + + def invoke(self, t, i, v): + return BIRD.skip_back(t, i, v) + + +BIRD.skip_back_gdb() + + +class BIRDList: + def __init__(self, val): + ltype = val.type.strip_typedefs() + if ltype.code != gdb.TYPE_CODE_UNION or ltype.tag != "list": + raise Exception(f"Not a list, is type {ltype}") + + self.head = val["head"] + self.tail_node = val["tail_node"] + + if str(self.head.address) == '0x0': + raise Exception("List head is NULL") + + if str(self.tail_node["prev"].address) == '0x0': + raise Exception("List tail is NULL") + + def walk(self, do): + cur = self.head + while cur.dereference() != self.tail_node: + do(cur) + cur = cur.dereference()["next"] + + +class BIRDListLength(gdb.Function): + """Returns length of the list, as in + print $list_length(routing_tables)""" + def __init__(self): + super(BIRDListLength, self).__init__("list_length") + + def count(self, _): + self.cnt += 1 + + def invoke(self, l): + self.cnt = 0 + BIRDList(l).walk(self.count) + return self.cnt + +BIRDListLength() + +class BIRDListItem(gdb.Function): + """Returns n-th item of the list.""" + def __init__(self): + super(BIRDListItem, self).__init__("list_item") + + class BLException(Exception): + def __init__(self, node, msg): + Exception.__init__(self, msg) + self.node = node + + def count(self, node): + if self.cnt == self.pos: + raise self.BLException(node, "Node found") + + self.cnt += 1 + + def invoke(self, l, n, t=None, item="n"): + self.cnt = 0 + self.pos = n + bl = BIRDList(l) + try: + bl.walk(self.count) + except self.BLException as e: + if t is None: + return e.node + else: + return BIRD.skip_back(t, item, e.node) + + raise Exception("List too short") + +BIRDListItem() + + def register_printers(objfile): objfile.pretty_printers.append(BIRDFInstPrinter.lookup) objfile.pretty_printers.append(BIRDFValPrinter.lookup) From df3264f51ff38c9366398564a9d342a26bc83f37 Mon Sep 17 00:00:00 2001 From: Maria Matejka Date: Mon, 24 May 2021 13:41:23 +0200 Subject: [PATCH 07/59] Lock position checking allows for safe lock unions --- lib/locking.h | 6 ++---- sysdep/unix/coroutine.c | 22 +++++++++++++++------- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/lib/locking.h b/lib/locking.h index eb1bc8fa..eef60154 100644 --- a/lib/locking.h +++ b/lib/locking.h @@ -16,16 +16,14 @@ struct lock_order { struct domain_generic *the_bird; }; -#define LOCK_ORDER_DEPTH (sizeof(struct lock_order) / sizeof(struct domain_generic *)) - extern _Thread_local struct lock_order locking_stack; extern _Thread_local struct domain_generic **last_locked; #define DOMAIN(type) struct domain__##type #define DEFINE_DOMAIN(type) DOMAIN(type) { struct domain_generic *type; } -#define DOMAIN_NEW(type, name) (DOMAIN(type)) { .type = domain_new(name) } -struct domain_generic *domain_new(const char *name); +#define DOMAIN_NEW(type, name) (DOMAIN(type)) { .type = domain_new(name, OFFSETOF(struct lock_order, type)) } +struct domain_generic *domain_new(const char *name, uint order); #define DOMAIN_NULL(type) (DOMAIN(type)) {} diff --git a/sysdep/unix/coroutine.c b/sysdep/unix/coroutine.c index 71847505..2eba142c 100644 --- a/sysdep/unix/coroutine.c +++ b/sysdep/unix/coroutine.c @@ -44,26 +44,31 @@ * Locking subsystem */ +_Thread_local struct lock_order locking_stack = {}; +_Thread_local struct domain_generic **last_locked = NULL; + #define ASSERT_NO_LOCK ASSERT_DIE(last_locked == NULL) struct domain_generic { pthread_mutex_t mutex; + uint order; struct domain_generic **prev; struct lock_order *locked_by; const char *name; }; -#define DOMAIN_INIT(_name) { .mutex = PTHREAD_MUTEX_INITIALIZER, .name = _name } +#define DOMAIN_INIT(_name, _order) { .mutex = PTHREAD_MUTEX_INITIALIZER, .name = _name, .order = _order } -static struct domain_generic the_bird_domain_gen = DOMAIN_INIT("The BIRD"); +static struct domain_generic the_bird_domain_gen = DOMAIN_INIT("The BIRD", OFFSETOF(struct lock_order, the_bird)); DOMAIN(the_bird) the_bird_domain = { .the_bird = &the_bird_domain_gen }; struct domain_generic * -domain_new(const char *name) +domain_new(const char *name, uint order) { + ASSERT_DIE(order < sizeof(struct lock_order)); struct domain_generic *dg = xmalloc(sizeof(struct domain_generic)); - *dg = (struct domain_generic) DOMAIN_INIT(name); + *dg = (struct domain_generic) DOMAIN_INIT(name, order); return dg; } @@ -74,11 +79,11 @@ domain_free(struct domain_generic *dg) xfree(dg); } -_Thread_local struct lock_order locking_stack = {}; -_Thread_local struct domain_generic **last_locked = NULL; - void do_lock(struct domain_generic *dg, struct domain_generic **lsp) { + if ((char *) lsp - (char *) &locking_stack != dg->order) + bug("Trying to lock on bad position: order=%u, lsp=%p, base=%p", dg->order, lsp, &locking_stack); + if (lsp <= last_locked) bug("Trying to lock in a bad order"); if (*lsp) @@ -96,6 +101,9 @@ void do_lock(struct domain_generic *dg, struct domain_generic **lsp) void do_unlock(struct domain_generic *dg, struct domain_generic **lsp) { + if ((char *) lsp - (char *) &locking_stack != dg->order) + bug("Trying to unlock on bad position: order=%u, lsp=%p, base=%p", dg->order, lsp, &locking_stack); + if (dg->locked_by != &locking_stack) bug("Inconsistent domain state on unlock"); if ((last_locked != lsp) || (*lsp != dg)) From 8d706aedba42f8ace0084c7b983ddaaaf47dff91 Mon Sep 17 00:00:00 2001 From: Maria Matejka Date: Mon, 9 Aug 2021 11:51:19 +0200 Subject: [PATCH 08/59] Fixing expensive list checks. Debug only commit. --- lib/lists.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/lib/lists.c b/lib/lists.c index fe2b692b..58d51073 100644 --- a/lib/lists.c +++ b/lib/lists.c @@ -35,11 +35,12 @@ check_list(list *l, node *n) if (!l) { ASSERT_DIE(n); - ASSERT_DIE(n->prev); - do { n = n->prev; } while (n->prev); + node *nn = n; + while (nn->prev) + nn = nn->prev; - l = SKIP_BACK(list, head_node, n); + l = SKIP_BACK(list, head_node, nn); } int seen = 0; @@ -60,7 +61,7 @@ check_list(list *l, node *n) } ASSERT_DIE(cur == &(l->tail_node)); - ASSERT_DIE(!n || (seen == 1)); + ASSERT_DIE(!n || (seen == 1) || (n == &l->head_node) || (n == &l->tail_node)); return 1; } @@ -129,7 +130,7 @@ self_link(node *n) LIST_INLINE void insert_node(node *n, node *after) { - EXPENSIVE_CHECK(check_list(l, after)); + EXPENSIVE_CHECK(check_list(NULL, after)); ASSUME(n->prev == NULL); ASSUME(n->next == NULL); @@ -150,7 +151,7 @@ insert_node(node *n, node *after) LIST_INLINE void rem_node(node *n) { - EXPENSIVE_CHECK(check_list(NULL, n)); + EXPENSIVE_CHECK((n == n->prev) && (n == n->next) || check_list(NULL, n)); node *z = n->prev; node *x = n->next; From a2af807357875f866f149b0f6db4d463d4533204 Mon Sep 17 00:00:00 2001 From: Maria Matejka Date: Mon, 28 Jun 2021 15:43:45 +0200 Subject: [PATCH 09/59] Debug messages with timestamps. On most of current hardware, getting monotonic clock is fast enough to get it and write for each debug message. --- sysdep/unix/log.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sysdep/unix/log.c b/sysdep/unix/log.c index dc2b14b3..f48588b6 100644 --- a/sysdep/unix/log.c +++ b/sysdep/unix/log.c @@ -439,6 +439,8 @@ done: void log_init_debug(char *f) { + clock_gettime(CLOCK_MONOTONIC, &dbg_time_start); + if (dbgf && dbgf != stderr) fclose(dbgf); if (!f) From c84ed603714db2c42a781f8dbb5b3fd540ff689f Mon Sep 17 00:00:00 2001 From: Maria Matejka Date: Fri, 18 Jun 2021 18:23:41 +0200 Subject: [PATCH 10/59] Moved BFD IO loop out of BFD as we want to use it as socket-io coroutine --- proto/bfd/Makefile | 4 ++-- proto/bfd/bfd.h | 2 +- sysdep/unix/Makefile | 2 +- proto/bfd/io.c => sysdep/unix/io-loop.c | 2 +- proto/bfd/io.h => sysdep/unix/io-loop.h | 6 +++--- 5 files changed, 8 insertions(+), 8 deletions(-) rename proto/bfd/io.c => sysdep/unix/io-loop.c (99%) rename proto/bfd/io.h => sysdep/unix/io-loop.h (89%) diff --git a/proto/bfd/Makefile b/proto/bfd/Makefile index 402122fc..267dff98 100644 --- a/proto/bfd/Makefile +++ b/proto/bfd/Makefile @@ -1,6 +1,6 @@ -src := bfd.c io.c packets.c +src := bfd.c packets.c obj := $(src-o-files) $(all-daemon) $(cf-local) -tests_objs := $(tests_objs) $(src-o-files) \ No newline at end of file +tests_objs := $(tests_objs) $(src-o-files) diff --git a/proto/bfd/bfd.h b/proto/bfd/bfd.h index 91fdaa60..9d4cbbf8 100644 --- a/proto/bfd/bfd.h +++ b/proto/bfd/bfd.h @@ -22,7 +22,7 @@ #include "lib/string.h" #include "nest/bfd.h" -#include "io.h" +#include "sysdep/unix/io-loop.h" #define BFD_CONTROL_PORT 3784 diff --git a/sysdep/unix/Makefile b/sysdep/unix/Makefile index 69cf8131..07f454ab 100644 --- a/sysdep/unix/Makefile +++ b/sysdep/unix/Makefile @@ -1,4 +1,4 @@ -src := alloc.c io.c krt.c log.c main.c random.c coroutine.c +src := alloc.c io.c io-loop.c krt.c log.c main.c random.c coroutine.c obj := $(src-o-files) $(all-daemon) $(cf-local) diff --git a/proto/bfd/io.c b/sysdep/unix/io-loop.c similarity index 99% rename from proto/bfd/io.c rename to sysdep/unix/io-loop.c index c5f1e024..a15d866a 100644 --- a/proto/bfd/io.c +++ b/sysdep/unix/io-loop.c @@ -15,7 +15,7 @@ #include #include "nest/bird.h" -#include "proto/bfd/io.h" +#include "sysdep/unix/io-loop.h" #include "lib/buffer.h" #include "lib/lists.h" diff --git a/proto/bfd/io.h b/sysdep/unix/io-loop.h similarity index 89% rename from proto/bfd/io.h rename to sysdep/unix/io-loop.h index ec706e9a..d858b04e 100644 --- a/proto/bfd/io.h +++ b/sysdep/unix/io-loop.h @@ -4,8 +4,8 @@ * Can be freely distributed and used under the terms of the GNU GPL. */ -#ifndef _BIRD_BFD_IO_H_ -#define _BIRD_BFD_IO_H_ +#ifndef _BIRD_IO_LOOP_H_ +#define _BIRD_IO_LOOP_H_ #include "nest/bird.h" #include "lib/lists.h" @@ -31,4 +31,4 @@ void birdloop_mask_wakeups(struct birdloop *loop); void birdloop_unmask_wakeups(struct birdloop *loop); -#endif /* _BIRD_BFD_IO_H_ */ +#endif /* _BIRD_IO_LOOP_H_ */ From 445eeaf3df126af2c7b61e71c4f08a583eb4fa60 Mon Sep 17 00:00:00 2001 From: Maria Matejka Date: Fri, 22 Oct 2021 21:03:25 +0200 Subject: [PATCH 11/59] Split route table event into separate events The former rt_event is dropped in favour of separate table events. This allows for selective corking of NHU and prune. --- nest/proto.c | 16 +++--- nest/protocol.h | 2 +- nest/route.h | 12 ++--- nest/rt-table.c | 139 +++++++++++++++++++----------------------------- 4 files changed, 69 insertions(+), 100 deletions(-) diff --git a/nest/proto.c b/nest/proto.c index ac0fb232..f8e1ba31 100644 --- a/nest/proto.c +++ b/nest/proto.c @@ -658,6 +658,8 @@ channel_aux_stopped(void *data) else c->in_table = NULL; + rfree(cat->tab->rp); + mb_free(cat); return channel_check_stopped(c); } @@ -666,7 +668,7 @@ static void channel_aux_import_stopped(struct rt_import_request *req) { struct channel_aux_table *cat = SKIP_BACK(struct channel_aux_table, push, req); - ASSERT_DIE(cat->stop); + ASSERT_DIE(cat->tab->delete_event); } static void @@ -675,25 +677,23 @@ channel_aux_export_stopped(struct rt_export_request *req) struct channel_aux_table *cat = SKIP_BACK(struct channel_aux_table, get, req); req->hook = NULL; - if (cat->refeed_pending && !cat->stop) + if (cat->refeed_pending && !cat->tab->delete_event) { cat->refeed_pending = 0; rt_request_export(cat->tab, req); } else - ASSERT_DIE(cat->stop); + ASSERT_DIE(cat->tab->delete_event); } static void channel_aux_stop(struct channel_aux_table *cat) { - cat->stop = 1; - rt_stop_import(&cat->push, channel_aux_import_stopped); rt_stop_export(&cat->get, channel_aux_export_stopped); - cat->tab->deleted = channel_aux_stopped; - cat->tab->del_data = cat; + cat->tab->delete_event = ev_new_init(cat->tab->rp, channel_aux_stopped, cat); + rt_unlock_table(cat->tab); } @@ -886,7 +886,6 @@ channel_setup_in_table(struct channel *c, int best) c->in_table->c = c; c->in_table->tab = rt_setup(c->proto->pool, &cat->tab_cf); - self_link(&c->in_table->tab->n); rt_lock_table(c->in_table->tab); rt_request_import(c->in_table->tab, &c->in_table->push); @@ -929,7 +928,6 @@ channel_setup_out_table(struct channel *c) c->out_table->c = c; c->out_table->tab = rt_setup(c->proto->pool, &cat->tab_cf); - self_link(&c->out_table->tab->n); rt_lock_table(c->out_table->tab); rt_request_import(c->out_table->tab, &c->out_table->push); diff --git a/nest/protocol.h b/nest/protocol.h index 2fbd607c..981ca96a 100644 --- a/nest/protocol.h +++ b/nest/protocol.h @@ -568,7 +568,7 @@ struct channel_aux_table { struct rt_import_request push; struct rt_export_request get; rtable *tab; - u8 stop; + event *stop; u8 refeed_pending; }; diff --git a/nest/route.h b/nest/route.h index 7c5fd02f..88a56073 100644 --- a/nest/route.h +++ b/nest/route.h @@ -147,6 +147,7 @@ void fit_copy(struct fib *f, struct fib_iterator *dst, struct fib_iterator *src) struct rtable_config { node n; char *name; + struct config *config; struct rtable *table; struct proto_config *krt_attached; /* Kernel syncer attached to this table */ uint addr_type; /* Type of address data stored in table (NET_*) */ @@ -175,17 +176,17 @@ typedef struct rtable { struct hmap id_map; struct hostcache *hostcache; struct rtable_config *config; /* Configuration of this table */ - void (*deleted)(void *); /* Table should free itself. Call this when it is done. */ - void *del_data; - struct event *rt_event; /* Routing table event */ + struct event *prune_event; /* Event to prune abandoned routes */ + struct event *ec_event; /* Event to prune finished exports */ + struct event *hcu_event; /* Event to update host cache */ + struct event *nhu_event; /* Event to update next hops */ + struct event *delete_event; /* Event to delete the table */ btime last_rt_change; /* Last time when route changed */ btime base_settle_time; /* Start time of rtable settling interval */ btime gc_time; /* Time of last GC */ int gc_counter; /* Number of operations since last GC */ byte prune_state; /* Table prune state, 1 -> scheduled, 2-> running */ - byte hcu_scheduled; /* Hostcache update is scheduled */ byte nhu_state; /* Next Hop Update state */ - byte export_used; /* Export journal setup scheduled */ struct fib_iterator prune_fit; /* Rtable prune FIB iterator */ struct fib_iterator nhu_fit; /* Next Hop Update FIB iterator */ struct tbf rl_pipe; /* Rate limiting token buffer for pipe collisions */ @@ -475,7 +476,6 @@ void rt_unlock_table(rtable *); void rt_subscribe(rtable *tab, struct rt_subscription *s); void rt_unsubscribe(struct rt_subscription *s); rtable *rt_setup(pool *, struct rtable_config *); -static inline void rt_shutdown(rtable *r) { rfree(r->rp); } static inline net *net_find(rtable *tab, const net_addr *addr) { return (net *) fib_find(&tab->fib, addr); } static inline net *net_find_valid(rtable *tab, const net_addr *addr) diff --git a/nest/rt-table.c b/nest/rt-table.c index 9c12ef56..6dd6587e 100644 --- a/nest/rt-table.c +++ b/nest/rt-table.c @@ -65,14 +65,14 @@ struct rt_export_block { static void rt_free_hostcache(rtable *tab); static void rt_notify_hostcache(rtable *tab, net *net); -static void rt_update_hostcache(rtable *tab); -static void rt_next_hop_update(rtable *tab); -static inline void rt_prune_table(rtable *tab); +static void rt_update_hostcache(void *tab); +static void rt_next_hop_update(void *tab); +static inline void rt_prune_table(void *tab); static inline void rt_schedule_notify(rtable *tab); static void rt_feed_channel(void *); static inline void rt_export_used(rtable *tab); -static void rt_export_cleanup(rtable *tab); +static void rt_export_cleanup(void *tab); static inline void rte_update_lock(void); static inline void rte_update_unlock(void); @@ -1860,7 +1860,7 @@ rte_dump(struct rte_storage *e) void rt_dump(rtable *t) { - debug("Dump of routing table <%s>%s\n", t->name, t->deleted ? " (deleted)" : ""); + debug("Dump of routing table <%s>%s\n", t->name, t->delete_event ? " (deleted)" : ""); #ifdef DEBUGGING fib_check(&t->fib); #endif @@ -1891,9 +1891,9 @@ rt_dump_all(void) void rt_dump_hooks(rtable *tab) { - debug("Dump of hooks in routing table <%s>%s\n", tab->name, tab->deleted ? " (deleted)" : ""); + debug("Dump of hooks in routing table <%s>%s\n", tab->name, tab->delete_event ? " (deleted)" : ""); debug(" nhu_state=%u hcu_scheduled=%u use_count=%d rt_count=%u\n", - tab->nhu_state, tab->hcu_scheduled, tab->use_count, tab->rt_count); + tab->nhu_state, ev_active(tab->hcu_event), tab->use_count, tab->rt_count); debug(" last_rt_change=%t gc_time=%t gc_counter=%d prune_state=%u\n", tab->last_rt_change, tab->gc_time, tab->gc_counter, tab->prune_state); @@ -1930,21 +1930,11 @@ rt_dump_hooks_all(void) rt_dump_hooks(t); } -static inline void -rt_schedule_hcu(rtable *tab) -{ - if (tab->hcu_scheduled) - return; - - tab->hcu_scheduled = 1; - ev_schedule(tab->rt_event); -} - static inline void rt_schedule_nhu(rtable *tab) { if (tab->nhu_state == NHU_CLEAN) - ev_schedule(tab->rt_event); + ev_schedule(tab->nhu_event); /* state change: * NHU_CLEAN -> NHU_SCHEDULED @@ -1957,7 +1947,7 @@ void rt_schedule_prune(rtable *tab) { if (tab->prune_state == 0) - ev_schedule(tab->rt_event); + ev_schedule(tab->prune_event); /* state change 0->1, 2->3 */ tab->prune_state |= 1; @@ -1969,36 +1959,9 @@ rt_export_used(rtable *tab) if (config->table_debug) log(L_TRACE "%s: Export cleanup requested", tab->name); - if (tab->export_used) - return; - - tab->export_used = 1; - ev_schedule(tab->rt_event); + ev_schedule(tab->ec_event); } -static void -rt_event(void *ptr) -{ - rtable *tab = ptr; - - rt_lock_table(tab); - - if (tab->export_used) - rt_export_cleanup(tab); - - if (tab->hcu_scheduled) - rt_update_hostcache(tab); - - if (tab->nhu_state) - rt_next_hop_update(tab); - - if (tab->prune_state) - rt_prune_table(tab); - - rt_unlock_table(tab); -} - - static inline btime rt_settled_time(rtable *tab) { @@ -2078,13 +2041,9 @@ rt_free(resource *_r) DBG("Deleting routing table %s\n", r->name); ASSERT_DIE(r->use_count == 0); - ASSERT_DIE(r->deleted); - - r->config->table = NULL; - rem_node(&r->n); - - if (r->hostcache) - rt_free_hostcache(r); + ASSERT_DIE(r->rt_count == 0); + ASSERT_DIE(EMPTY_LIST(r->imports)); + ASSERT_DIE(EMPTY_LIST(r->exports)); /* Freed automagically by the resource pool fib_free(&r->fib); @@ -2142,7 +2101,11 @@ rt_setup(pool *pp, struct rtable_config *cf) init_list(&t->pending_exports); init_list(&t->subscribers); - t->rt_event = ev_new_init(p, rt_event, t); + t->ec_event = ev_new_init(p, rt_export_cleanup, t); + t->prune_event = ev_new_init(p, rt_prune_table, t); + t->hcu_event = ev_new_init(p, rt_update_hostcache, t); + t->nhu_event = ev_new_init(p, rt_next_hop_update, t); + t->export_timer = tm_new_init(p, rt_announce_exports, t, 0, 0); t->last_rt_change = t->gc_time = current_time(); t->next_export_seq = 1; @@ -2167,7 +2130,6 @@ rt_init(void) init_list(&routing_tables); } - /** * rt_prune_table - prune a routing table * @@ -2183,8 +2145,9 @@ rt_init(void) * iteration. */ static void -rt_prune_table(rtable *tab) +rt_prune_table(void *data) { + rtable *tab = data; struct fib_iterator *fit = &tab->prune_fit; int limit = 512; @@ -2232,7 +2195,7 @@ again: if (limit <= 0) { FIB_ITERATE_PUT(fit); - ev_schedule(tab->rt_event); + ev_schedule(tab->prune_event); return; } @@ -2288,9 +2251,9 @@ again: } static void -rt_export_cleanup(rtable *tab) +rt_export_cleanup(void *data) { - tab->export_used = 0; + rtable *tab = data; u64 min_seq = ~((u64) 0); struct rt_pending_export *last_export_to_free = NULL; @@ -2447,9 +2410,6 @@ done:; imports_stopped = 1; } - if (tab->export_used) - ev_schedule(tab->rt_event); - if (imports_stopped) { if (config->table_debug) @@ -2683,8 +2643,9 @@ rt_next_hop_update_net(rtable *tab, net *n) } static void -rt_next_hop_update(rtable *tab) +rt_next_hop_update(void *data) { + rtable *tab = data; struct fib_iterator *fit = &tab->nhu_fit; int max_feed = 32; @@ -2702,7 +2663,7 @@ rt_next_hop_update(rtable *tab) if (max_feed <= 0) { FIB_ITERATE_PUT(fit); - ev_schedule(tab->rt_event); + ev_schedule(tab->nhu_event); return; } max_feed -= rt_next_hop_update_net(tab, n); @@ -2716,7 +2677,7 @@ rt_next_hop_update(rtable *tab) tab->nhu_state &= 1; if (tab->nhu_state != NHU_CLEAN) - ev_schedule(tab->rt_event); + ev_schedule(tab->nhu_event); } @@ -2738,6 +2699,7 @@ rt_new_table(struct symbol *s, uint addr_type) c->gc_min_time = 5; c->min_settle_time = 1 S; c->max_settle_time = 20 S; + c->config = new_config; add_tail(&new_config->tables, &c->n); @@ -2773,17 +2735,12 @@ rt_lock_table(rtable *r) void rt_unlock_table(rtable *r) { - if (!--r->use_count && r->deleted) - { - void *del_data = r->del_data; - void (*deleted)(void *) = r->deleted; - - /* Delete the routing table by freeing its pool */ - rt_shutdown(r); - deleted(del_data); - } + if (!--r->use_count && r->delete_event) + /* Delete the routing table by freeing its pool */ + ev_schedule(r->delete_event); } + static struct rtable_config * rt_find_table_config(struct config *cf, char *name) { @@ -2791,7 +2748,23 @@ rt_find_table_config(struct config *cf, char *name) return (sym && (sym->class == SYM_TABLE)) ? sym->table : NULL; } -static void rt_config_del_obstacle(void *data) { config_del_obstacle(data); } +static void +rt_done(void *data) +{ + rtable *t = data; + struct rtable_config *tc = t->config; + struct config *c = tc->config; + + tc->table = NULL; + rem_node(&t->n); + + if (t->hostcache) + rt_free_hostcache(t); + + rfree(t->rp); + + config_del_obstacle(c); +} /** * rt_commit - commit new routing table configuration @@ -2816,7 +2789,7 @@ rt_commit(struct config *new, struct config *old) WALK_LIST(o, old->tables) { rtable *ot = o->table; - if (!ot->deleted) + if (!ot->delete_event) { r = rt_find_table_config(new, o->name); if (r && (r->addr_type == o->addr_type) && !new->shutdown) @@ -2832,8 +2805,7 @@ rt_commit(struct config *new, struct config *old) { DBG("\t%s: deleted\n", o->name); rt_lock_table(ot); - ot->deleted = rt_config_del_obstacle; - ot->del_data = old; + ot->delete_event = ev_new_init(ot->rp, rt_done, ot); config_add_obstacle(old); rt_unlock_table(ot); } @@ -3060,11 +3032,11 @@ rt_free_hostcache(rtable *tab) static void rt_notify_hostcache(rtable *tab, net *net) { - if (tab->hcu_scheduled) + if (ev_active(tab->hcu_event)) return; if (trie_match_net(tab->hostcache->trie, net->n.addr)) - rt_schedule_hcu(tab); + ev_schedule(tab->hcu_event); } static int @@ -3161,8 +3133,9 @@ done: } static void -rt_update_hostcache(rtable *tab) +rt_update_hostcache(void *data) { + rtable *tab = data; struct hostcache *hc = tab->hostcache; struct hostentry *he; node *n, *x; @@ -3183,8 +3156,6 @@ rt_update_hostcache(rtable *tab) if (rt_update_hostentry(tab, he)) rt_schedule_nhu(he->tab); } - - tab->hcu_scheduled = 0; } struct hostentry * From 44f26c49f966ca842ff9af55468de0b98c44b73e Mon Sep 17 00:00:00 2001 From: Maria Matejka Date: Wed, 6 Oct 2021 15:10:33 +0200 Subject: [PATCH 12/59] Special table hooks rectified. * internal tables are now more standalone, having their own import and export hooks * route refresh/reload uses stale counter instead of stale flag, allowing to drop walking the table at the beginning * route modify (by BGP LLGR) is now done by a special refeed hook, reimporting the modified routes directly without filters --- lib/lists.c | 9 + lib/lists.h | 1 + nest/config.Y | 4 +- nest/proto.c | 432 ++++++++++++++++++++++++------- nest/protocol.h | 23 +- nest/route.h | 35 +-- nest/rt-attr.c | 3 +- nest/rt-show.c | 7 +- nest/rt-table.c | 597 ++++++++++++------------------------------- proto/bgp/attrs.c | 50 ++-- proto/bgp/bgp.c | 94 +++++-- proto/bgp/bgp.h | 3 +- proto/bgp/packets.c | 2 +- proto/rpki/packets.c | 8 +- sysdep/unix/krt.c | 256 ++----------------- sysdep/unix/krt.h | 4 - 16 files changed, 682 insertions(+), 846 deletions(-) diff --git a/lib/lists.c b/lib/lists.c index 200576cf..fe2b692b 100644 --- a/lib/lists.c +++ b/lib/lists.c @@ -109,6 +109,15 @@ add_head(list *l, node *n) l->head = n; } +LIST_INLINE void +self_link(node *n) +{ + ASSUME(n->prev == NULL); + ASSUME(n->next == NULL); + + n->prev = n->next = n; +} + /** * insert_node - insert a node to a list * @n: a new list node diff --git a/lib/lists.h b/lib/lists.h index 479f4ed1..64b4a981 100644 --- a/lib/lists.h +++ b/lib/lists.h @@ -78,6 +78,7 @@ typedef union list { /* In fact two overlayed nodes */ #define LIST_INLINE void add_tail(list *, node *); void add_head(list *, node *); +void self_link(node *); void rem_node(node *); void add_tail_list(list *, list *); void init_list(list *); diff --git a/nest/config.Y b/nest/config.Y index 29d6b0db..a56b25be 100644 --- a/nest/config.Y +++ b/nest/config.Y @@ -644,12 +644,12 @@ r_args: } | r_args IMPORT TABLE channel_arg { if (!$4->in_table) cf_error("No import table in channel %s.%s", $4->proto->name, $4->name); - rt_show_add_table($$, $4->in_table); + rt_show_add_table($$, $4->in_table->tab); $$->tables_defined_by = RSD_TDB_DIRECT; } | r_args EXPORT TABLE channel_arg { if (!$4->out_table) cf_error("No export table in channel %s.%s", $4->proto->name, $4->name); - rt_show_add_table($$, $4->out_table); + rt_show_add_table($$, $4->out_table->tab); $$->tables_defined_by = RSD_TDB_DIRECT; } | r_args FILTER filter { diff --git a/nest/proto.c b/nest/proto.c index 09582d2e..cf448fd9 100644 --- a/nest/proto.c +++ b/nest/proto.c @@ -47,7 +47,7 @@ static char *c_states[] = { "DOWN", "START", "UP", "STOP", "RESTART" }; extern struct protocol proto_unix_iface; -static void channel_request_reload(struct channel *c); +static void channel_aux_request_refeed(struct channel_aux_table *cat); static void proto_shutdown_loop(timer *); static void proto_rethink_goal(struct proto *p); static char *proto_state_name(struct proto *p); @@ -88,7 +88,9 @@ channel_export_log_state_change(struct rt_export_request *req, u8 state) switch (state) { case TES_FEEDING: - if (c->proto->feed_begin) + if (c->out_table) + rt_refresh_begin(&c->out_table->push); + else if (c->proto->feed_begin) c->proto->feed_begin(c, !c->refeeding); break; case TES_READY: @@ -179,6 +181,7 @@ proto_find_channel_by_name(struct proto *p, const char *n) } rte * channel_preimport(struct rt_import_request *req, rte *new, rte *old); +rte * channel_in_preimport(struct rt_import_request *req, rte *new, rte *old); void rt_notify_optimal(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *rpe); void rt_notify_any(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *rpe); @@ -295,14 +298,10 @@ static void channel_roa_in_changed(struct rt_subscription *s) { struct channel *c = s->data; - int active = c->reload_event && ev_active(c->reload_event); - CD(c, "Reload triggered by RPKI change%s", active ? " - already active" : ""); + CD(c, "Reload triggered by RPKI change"); - if (!active) - channel_request_reload(c); - else - c->reload_pending = 1; + channel_request_reload(c); } static void @@ -444,7 +443,6 @@ channel_start_import(struct channel *c) .dump_req = channel_dump_import_req, .log_state_change = channel_import_log_state_change, .preimport = channel_preimport, - .rte_modify = c->proto->rte_modify, }; ASSERT(c->channel_state == CS_UP); @@ -463,7 +461,8 @@ channel_start_export(struct channel *c) { if (c->out_req.hook) { - log(L_WARN "%s.%s: Attempted to start channel's already started export", c->proto->name, c->name); + c->restart_export = 1; + log(L_WARN "%s.%s: Fast channel export restart", c->proto->name, c->name); return; } @@ -514,7 +513,7 @@ channel_check_stopped(struct channel *c) switch (c->channel_state) { case CS_STOP: - if (c->out_req.hook || c->in_req.hook) + if (c->out_req.hook || c->in_req.hook || c->out_table || c->in_table) return; channel_set_state(c, CS_DOWN); @@ -541,9 +540,6 @@ channel_import_stopped(struct rt_import_request *req) req->hook = NULL; - if (c->in_table) - rt_prune_sync(c->in_table, 1); - mb_free(c->in_req.name); c->in_req.name = NULL; @@ -566,14 +562,16 @@ channel_export_stopped(struct rt_export_request *req) return; } - /* Free the routes from out_table */ - if (c->out_table) - rt_prune_sync(c->out_table, 1); - mb_free(c->out_req.name); c->out_req.name = NULL; - channel_check_stopped(c); + if (c->restart_export) + { + c->restart_export = 0; + channel_start_export(c); + } + else + channel_check_stopped(c); } static void @@ -595,72 +593,296 @@ channel_feed_end(struct channel *c) return; } - if (c->proto->feed_end) + if (c->out_table) + rt_refresh_end(&c->out_table->push); + else if (c->proto->feed_end) c->proto->feed_end(c); if (c->refeed_pending) rt_stop_export(req, channel_export_stopped); - else - c->refeeding = 0; } -/* Called by protocol for reload from in_table */ -void -channel_schedule_reload(struct channel *c) -{ - ASSERT(c->in_req.hook); +#define CHANNEL_AUX_TABLE_DUMP_REQ(inout, imex, pgimex, pushget) static void \ + channel_##inout##_##pushget##_dump_req(struct rt_##pgimex##_request *req) { \ + struct channel_aux_table *cat = SKIP_BACK(struct channel_aux_table, pushget, req); \ + debug(" Channel %s.%s " #imex " table " #pushget " request %p\n", cat->c->proto->name, cat->c->name, req); } - rt_reload_channel_abort(c); - ev_schedule_work(c->reload_event); +CHANNEL_AUX_TABLE_DUMP_REQ(in, import, import, push) +CHANNEL_AUX_TABLE_DUMP_REQ(in, import, export, get) +CHANNEL_AUX_TABLE_DUMP_REQ(out, export, import, push) +CHANNEL_AUX_TABLE_DUMP_REQ(out, export, export, get) + +#undef CHANNEL_AUX_TABLE_DUMP_REQ + +static uint channel_aux_imex(struct channel_aux_table *cat) +{ + if (cat->c->in_table == cat) + return 0; + else if (cat->c->out_table == cat) + return 1; + else + bug("Channel aux table must be in_table or out_table"); } static void -channel_reload_loop(void *ptr) +channel_aux_stopped(void *data) { - struct channel *c = ptr; + struct channel_aux_table *cat = data; + struct channel *c = cat->c; - /* Start reload */ - if (!c->reload_active) - c->reload_pending = 0; + if (channel_aux_imex(cat)) + c->out_table = NULL; + else + c->in_table = NULL; - if (!rt_reload_channel(c)) + mb_free(cat); + return channel_check_stopped(c); +} + +static void +channel_aux_import_stopped(struct rt_import_request *req) +{ + struct channel_aux_table *cat = SKIP_BACK(struct channel_aux_table, push, req); + ASSERT_DIE(cat->stop); +} + +static void +channel_aux_export_stopped(struct rt_export_request *req) +{ + struct channel_aux_table *cat = SKIP_BACK(struct channel_aux_table, push, req); + req->hook = NULL; + + if (cat->refeed_pending && !cat->stop) { - ev_schedule_work(c->reload_event); - return; + cat->refeed_pending = 0; + rt_request_export(cat->tab, req); } + else + ASSERT_DIE(cat->stop); +} - /* Restart reload */ - if (c->reload_pending) - channel_request_reload(c); +static void +channel_aux_stop(struct channel_aux_table *cat) +{ + cat->stop = 1; + + rt_stop_import(&cat->push, channel_aux_import_stopped); + rt_stop_export(&cat->get, channel_aux_export_stopped); + + rt_lock_table(cat->tab); + cat->tab->deleted = channel_aux_stopped; + cat->tab->del_data = cat; + rt_unlock_table(cat->tab); +} + +static void +channel_push_log_state_change(struct rt_import_request *req, u8 state) +{ + struct channel_aux_table *cat = SKIP_BACK(struct channel_aux_table, push, req); + const char *imex = channel_aux_imex(cat) ? "export" : "import"; + CD(cat->c, "Channel %s table import state changed to %s", imex, rt_import_state_name(state)); +} + +static void +channel_get_log_state_change(struct rt_export_request *req, u8 state) +{ + struct channel_aux_table *cat = SKIP_BACK(struct channel_aux_table, get, req); + const char *imex = channel_aux_imex(cat) ? "export" : "import"; + CD(cat->c, "Channel %s table export state changed to %s", imex, rt_export_state_name(state)); + + switch (state) + { + case TES_FEEDING: + if (imex && cat->c->proto->feed_begin) + cat->c->proto->feed_begin(cat->c, !cat->c->refeeding); + else if (!imex) + rt_refresh_begin(&cat->c->in_req); + break; + + case TES_READY: + if (imex && cat->c->proto->feed_end) + cat->c->proto->feed_end(cat->c); + else if (!imex) + rt_refresh_end(&cat->c->in_req); + + if (cat->refeed_pending) + rt_stop_export(&cat->get, channel_aux_export_stopped); + + break; + } +} + +void rte_update_direct(struct channel *c, const net_addr *n, rte *new, struct rte_src *src); + +static void +channel_in_export_one_any(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *rpe) +{ + struct channel_aux_table *cat = SKIP_BACK(struct channel_aux_table, get, req); + + if (!rpe->new && !rpe->old) + return; + + rte n0; + struct rte_src *src = rpe->new ? rpe->new->rte.src : rpe->old->rte.src; + rte_update_direct(cat->c, net, RTES_CLONE(rpe->new, &n0), src); +} + +static void +channel_in_export_one_best(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *rpe) +{ + struct channel_aux_table *cat = SKIP_BACK(struct channel_aux_table, get, req); + + if (!rpe->new && !rpe->old) + return; + + rte n0; + struct rte_src *src = rpe->old_best ? rpe->old_best->rte.src : rpe->new_best->rte.src; + rte_update_direct(cat->c, net, RTES_CLONE(rpe->new_best, &n0), src); +} + +static void +channel_in_export_bulk_any(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *rpe UNUSED, rte **feed, uint count) +{ + struct channel_aux_table *cat = SKIP_BACK(struct channel_aux_table, get, req); + for (uint i=0; ic, net, &n0, n0.src); + } +} + +static void +channel_in_export_bulk_best(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *rpe UNUSED, rte **feed, uint count) +{ + struct channel_aux_table *cat = SKIP_BACK(struct channel_aux_table, get, req); + if (!count) + return; + + rte n0 = *feed[0]; + rte_update_direct(cat->c, net, &n0, n0.src); +} + +void do_rt_notify_direct(struct channel *c, const net_addr *net, rte *new, const rte *old); + +static void +channel_out_export_one_any(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *rpe) +{ + struct channel_aux_table *cat = SKIP_BACK(struct channel_aux_table, get, req); + rte n0; + do_rt_notify_direct(cat->c, net, RTES_CLONE(rpe->new, &n0), RTES_OR_NULL(rpe->old)); +} + +static void +channel_out_export_one_best(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *rpe) +{ + struct channel_aux_table *cat = SKIP_BACK(struct channel_aux_table, get, req); + rte n0; + do_rt_notify_direct(cat->c, net, RTES_CLONE(rpe->new_best, &n0), RTES_OR_NULL(rpe->old_best)); +} + +static void +channel_out_export_bulk(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *rpe UNUSED, rte **feed, uint count) +{ + struct channel_aux_table *cat = SKIP_BACK(struct channel_aux_table, get, req); + if (cat->c->ra_mode != RA_ANY) + ASSERT_DIE(count <= 1); + + for (uint i=0; ic, net, &n0, NULL); + } } /* Called by protocol to activate in_table */ void -channel_setup_in_table(struct channel *c) +channel_setup_in_table(struct channel *c, int best) { - struct rtable_config *cf = mb_allocz(c->proto->pool, sizeof(struct rtable_config)); + int nlen = sizeof("import") + strlen(c->name) + strlen(c->proto->name) + 3; - cf->name = "import"; - cf->addr_type = c->net_type; - cf->internal = 1; + struct { + struct channel_aux_table cat; + struct rtable_config tab_cf; + char name[0]; + } *cat = mb_allocz(c->proto->pool, sizeof(*cat) + nlen); - c->in_table = rt_setup(c->proto->pool, cf); + bsprintf(cat->name, "%s.%s.import", c->proto->name, c->name); - c->reload_event = ev_new_init(c->proto->pool, channel_reload_loop, c); + cat->tab_cf.name = cat->name; + cat->tab_cf.addr_type = c->net_type; + + c->in_table = &cat->cat; + c->in_table->push = (struct rt_import_request) { + .name = cat->name, + .trace_routes = c->debug | c->proto->debug, + .dump_req = channel_in_push_dump_req, + .log_state_change = channel_push_log_state_change, + .preimport = channel_in_preimport, + }; + c->in_table->get = (struct rt_export_request) { + .name = cat->name, + .trace_routes = c->debug | c->proto->debug, + .dump_req = channel_in_get_dump_req, + .log_state_change = channel_get_log_state_change, + .export_one = best ? channel_in_export_one_best : channel_in_export_one_any, + .export_bulk = best ? channel_in_export_bulk_best : channel_in_export_bulk_any, + }; + + c->in_table->c = c; + c->in_table->tab = rt_setup(c->proto->pool, &cat->tab_cf); + self_link(&c->in_table->tab->n); + + rt_request_import(c->in_table->tab, &c->in_table->push); + rt_request_export(c->in_table->tab, &c->in_table->get); } /* Called by protocol to activate out_table */ void channel_setup_out_table(struct channel *c) { - struct rtable_config *cf = mb_allocz(c->proto->pool, sizeof(struct rtable_config)); - cf->name = "export"; - cf->addr_type = c->net_type; - cf->internal = 1; + int nlen = sizeof("export") + strlen(c->name) + strlen(c->proto->name) + 3; - c->out_table = rt_setup(c->proto->pool, cf); + struct { + struct channel_aux_table cat; + struct rtable_config tab_cf; + char name[0]; + } *cat = mb_allocz(c->proto->pool, sizeof(*cat) + nlen); + + bsprintf(cat->name, "%s.%s.export", c->proto->name, c->name); + + cat->tab_cf.name = cat->name; + cat->tab_cf.addr_type = c->net_type; + + c->out_table = &cat->cat; + c->out_table->push = (struct rt_import_request) { + .name = cat->name, + .trace_routes = c->debug | c->proto->debug, + .dump_req = channel_out_push_dump_req, + .log_state_change = channel_push_log_state_change, + }; + c->out_table->get = (struct rt_export_request) { + .name = cat->name, + .trace_routes = c->debug | c->proto->debug, + .dump_req = channel_out_get_dump_req, + .log_state_change = channel_get_log_state_change, + .export_one = (c->ra_mode == RA_ANY) ? channel_out_export_one_any : channel_out_export_one_best, + .export_bulk = channel_out_export_bulk, + }; + + c->out_table->c = c; + c->out_table->tab = rt_setup(c->proto->pool, &cat->tab_cf); + self_link(&c->out_table->tab->n); + + rt_request_import(c->out_table->tab, &c->out_table->push); + rt_request_export(c->out_table->tab, &c->out_table->get); } +static void +channel_aux_request_refeed(struct channel_aux_table *cat) +{ + cat->refeed_pending = 1; + rt_stop_export(&cat->get, channel_aux_export_stopped); +} static void channel_do_start(struct channel *c) @@ -686,16 +908,12 @@ channel_do_up(struct channel *c) static void channel_do_pause(struct channel *c) { - /* Need to abort feeding */ - if (c->reload_event) - { - ev_postpone(c->reload_event); - rt_reload_channel_abort(c); - } - /* Stop export */ if (c->out_req.hook) + { rt_stop_export(&c->out_req, channel_export_stopped); + c->refeeding = 0; + } channel_roa_unsubscribe_all(c); @@ -706,6 +924,13 @@ channel_do_pause(struct channel *c) static void channel_do_stop(struct channel *c) { + /* Drop auxiliary tables */ + if (c->in_table) + channel_aux_stop(c->in_table); + + if (c->out_table) + channel_aux_stop(c->out_table); + /* Stop import */ if (c->in_req.hook) rt_stop_import(&c->in_req, channel_import_stopped); @@ -716,16 +941,13 @@ channel_do_stop(struct channel *c) CALL(c->channel->shutdown, c); - /* This have to be done in here, as channel pool is freed before channel_do_down() */ - c->in_table = NULL; - c->reload_event = NULL; - c->out_table = NULL; + channel_roa_unsubscribe_all(c); } static void channel_do_down(struct channel *c) { - ASSERT(!c->reload_active); + ASSERT(!c->out_req.hook && !c->in_req.hook && !c->out_table && !c->in_table); c->proto->active_channels--; @@ -733,14 +955,12 @@ channel_do_down(struct channel *c) memset(&c->import_stats, 0, sizeof(struct channel_import_stats)); memset(&c->export_stats, 0, sizeof(struct channel_export_stats)); - c->in_table = NULL; - c->reload_event = NULL; - c->out_table = NULL; - - /* The in_table and out_table are going to be freed by freeing their resource pools. */ - CALL(c->channel->cleanup, c); + /* This have to be done in here, as channel pool is freed before channel_do_down() */ + bmap_free(&c->export_map); + bmap_free(&c->export_reject_map); + /* Schedule protocol shutddown */ if (proto_is_done(c->proto)) ev_schedule(c->proto->event); @@ -769,7 +989,7 @@ channel_set_state(struct channel *c, uint state) break; case CS_UP: - ASSERT(cs == CS_DOWN || cs == CS_START); + ASSERT(cs == CS_DOWN || cs == CS_START || cs == CS_PAUSE); if (cs == CS_DOWN) channel_do_start(c); @@ -819,8 +1039,8 @@ channel_set_state(struct channel *c, uint state) * completed, it will switch back to ES_READY. This function can be called * even when feeding is already running, in that case it is restarted. */ -void -channel_request_feeding(struct channel *c) +static void +channel_request_table_feeding(struct channel *c) { ASSERT(c->out_req.hook); @@ -828,7 +1048,18 @@ channel_request_feeding(struct channel *c) rt_stop_export(&c->out_req, channel_export_stopped); } -static void +void +channel_request_feeding(struct channel *c) +{ + ASSERT(c->out_req.hook); + + if (c->out_table) + channel_aux_request_refeed(c->out_table); + else + channel_request_table_feeding(c); +} + +void channel_request_reload(struct channel *c) { ASSERT(c->in_req.hook); @@ -836,14 +1067,29 @@ channel_request_reload(struct channel *c) CD(c, "Reload requested"); - c->proto->reload_routes(c); + if (c->in_table) + channel_aux_request_refeed(c->in_table); + else + c->proto->reload_routes(c); +} - /* - * Should this be done before reload_routes() hook? - * Perhaps, but routes are updated asynchronously. - */ - channel_reset_limit(c, &c->rx_limit, PLD_RX); - channel_reset_limit(c, &c->in_limit, PLD_IN); +void +channel_refresh_begin(struct channel *c) +{ + CD(c, "Channel route refresh begin"); + if (c->in_table) + rt_refresh_begin(&c->in_table->push); + else + rt_refresh_begin(&c->in_req); +} + +void +channel_refresh_end(struct channel *c) +{ + if (c->in_table) + rt_refresh_end(&c->in_table->push); + else + rt_refresh_end(&c->in_req); } const struct channel_class channel_basic = { @@ -1001,7 +1247,7 @@ channel_reconfigure(struct channel *c, struct channel_config *cf) channel_request_reload(c); if (export_changed) - channel_request_feeding(c); + channel_request_table_feeding(c); done: CD(c, "Reconfigured"); @@ -1714,7 +1960,7 @@ protos_dump_all(void) WALK_LIST(p, proto_list) { #define DPF(x) (p->x ? " " #x : "") - debug(" protocol %s (%p) state %s with %d active channels flags: %s%s%s%s%s\n", + debug(" protocol %s (%p) state %s with %d active channels flags: %s%s%s%s\n", p->name, p, p_states[p->proto_state], p->active_channels, DPF(disabled), DPF(active), DPF(do_start), DPF(do_stop), DPF(reconfiguring)); #undef DPF @@ -1730,6 +1976,20 @@ protos_dump_all(void) debug("\tChannel state: %s/%s/%s\n", c_states[c->channel_state], c->in_req.hook ? rt_import_state_name(rt_import_get_state(c->in_req.hook)) : "-", c->out_req.hook ? rt_export_state_name(rt_export_get_state(c->out_req.hook)) : "-"); + if (c->in_table) + { + debug("\tInput aux table:\n"); + rt_dump_hooks(c->in_table->tab); + rt_dump(c->in_table->tab); + debug("\tEnd of input aux table.\n"); + } + if (c->out_table) + { + debug("\tOutput aux table:\n"); + rt_dump_hooks(c->in_table->tab); + rt_dump(c->in_table->tab); + debug("\tEnd of output aux table.\n"); + } } if (p->proto->dump && (p->proto_state != PS_DOWN)) @@ -2151,11 +2411,11 @@ channel_show_stats(struct channel *c) cli_msg(-1006, " Routes: %u imported, %u exported, %u preferred", in_routes, out_routes, SRI(pref)); - cli_msg(-1006, " Route change stats: received rejected filtered ignored RX limit IN limit accepted"); - cli_msg(-1006, " Import updates: %10u %10u %10u %10u %10u %10u %10u", + cli_msg(-1006, " Route change stats: received rejected filtered ignored limited accepted"); + cli_msg(-1006, " Import updates: %10u %10u %10u %10u %10u %10u", SCI(updates_received), SCI(updates_invalid), SCI(updates_filtered), SRI(updates_ignored), - SCI(updates_limited_rx), SCI(updates_limited_in), + SCI(updates_limited_rx) + SCI(updates_limited_in), SRI(updates_accepted)); cli_msg(-1006, " Import withdraws: %10u %10u --- %10u --- %10u", SCI(withdraws_received), SCI(withdraws_invalid), diff --git a/nest/protocol.h b/nest/protocol.h index 7447cbf0..f9996b18 100644 --- a/nest/protocol.h +++ b/nest/protocol.h @@ -207,7 +207,6 @@ struct proto { int (*rte_recalculate)(struct rtable *, struct network *, struct rte *, struct rte *, struct rte *); int (*rte_better)(struct rte *, struct rte *); int (*rte_mergable)(struct rte *, struct rte *); - struct rte *(*rte_modify)(struct rte *, struct linpool *); void (*rte_insert)(struct network *, struct rte *); void (*rte_remove)(struct network *, struct rte *); u32 (*rte_igp_metric)(struct rte *); @@ -544,24 +543,29 @@ struct channel { u8 reloadable; /* Hook reload_routes() is allowed on the channel */ u8 gr_lock; /* Graceful restart mechanism should wait for this channel */ u8 gr_wait; /* Route export to channel is postponed until graceful restart */ + u8 restart_export; /* Route export should restart as soon as it stops */ btime last_state_change; /* Time of last state transition */ - struct rtable *in_table; /* Internal table for received routes */ - struct event *reload_event; /* Event responsible for reloading from in_table */ - struct fib_iterator reload_fit; /* FIB iterator in in_table used during reloading */ - struct rte_storage *reload_next_rte; /* Route iterator in in_table used during reloading */ - u8 reload_active; /* Iterator reload_fit is linked */ + struct channel_aux_table *in_table; /* Internal table for received routes */ u8 reload_pending; /* Reloading and another reload is scheduled */ u8 refeed_pending; /* Refeeding and another refeed is scheduled */ u8 rpki_reload; /* RPKI changes trigger channel reload */ - struct rtable *out_table; /* Internal table for exported routes */ + struct channel_aux_table *out_table; /* Internal table for exported routes */ list roa_subscriptions; /* List of active ROA table subscriptions based on filters roa_check() */ }; +struct channel_aux_table { + struct channel *c; + struct rt_import_request push; + struct rt_export_request get; + rtable *tab; + u8 stop; + u8 refeed_pending; +}; /* * Channel states @@ -627,7 +631,7 @@ struct channel *proto_add_channel(struct proto *p, struct channel_config *cf); int proto_configure_channel(struct proto *p, struct channel **c, struct channel_config *cf); void channel_set_state(struct channel *c, uint state); -void channel_setup_in_table(struct channel *c); +void channel_setup_in_table(struct channel *c, int best); void channel_setup_out_table(struct channel *c); void channel_schedule_reload(struct channel *c); @@ -636,6 +640,9 @@ static inline void channel_open(struct channel *c) { channel_set_state(c, CS_UP) static inline void channel_close(struct channel *c) { channel_set_state(c, CS_STOP); } void channel_request_feeding(struct channel *c); +void channel_request_reload(struct channel *c); +void channel_refresh_begin(struct channel *c); +void channel_refresh_end(struct channel *c); void *channel_config_new(const struct channel_class *cc, const char *name, uint net_type, struct proto_config *proto); void *channel_config_get(const struct channel_class *cc, const char *name, uint net_type, struct proto_config *proto); int channel_reconfigure(struct channel *c, struct channel_config *cf); diff --git a/nest/route.h b/nest/route.h index d0568133..cb66be2a 100644 --- a/nest/route.h +++ b/nest/route.h @@ -150,7 +150,6 @@ struct rtable_config { int gc_max_ops; /* Maximum number of operations before GC is run */ int gc_min_time; /* Minimum time between two consecutive GC runs */ byte sorted; /* Routes of network are sorted according to rte_better() */ - byte internal; /* Internal table of a protocol */ btime min_settle_time; /* Minimum settle time for notifications */ btime max_settle_time; /* Maximum settle time for notifications */ }; @@ -172,10 +171,8 @@ typedef struct rtable { struct hmap id_map; struct hostcache *hostcache; struct rtable_config *config; /* Configuration of this table */ - struct config *deleted; /* Table doesn't exist in current configuration, - * delete as soon as use_count becomes 0 and remove - * obstacle from this routing table. - */ + void (*deleted)(void *); /* Table should free itself. Call this when it is done. */ + void *del_data; struct event *rt_event; /* Routing table event */ btime last_rt_change; /* Last time when route changed */ btime base_settle_time; /* Start time of rtable settling interval */ @@ -184,7 +181,6 @@ typedef struct rtable { byte prune_state; /* Table prune state, 1 -> scheduled, 2-> running */ byte hcu_scheduled; /* Hostcache update is scheduled */ byte nhu_state; /* Next Hop Update state */ - byte internal; /* This table is internal for some other object */ struct fib_iterator prune_fit; /* Rtable prune FIB iterator */ struct fib_iterator nhu_fit; /* Next Hop Update FIB iterator */ struct tbf rl_pipe; /* Rate limiting token buffer for pipe collisions */ @@ -249,6 +245,7 @@ typedef struct rte { u8 generation; /* If this route import is based on other previously exported route, this value should be 1 + MAX(generation of the parent routes). Otherwise the route is independent and this value is zero. */ + u8 stale_cycle; /* Auxiliary value for route refresh */ } rte; struct rte_storage { @@ -256,13 +253,11 @@ struct rte_storage { struct rte rte; /* Route data */ }; -#define RTE_COPY(r, l) ((r) ? (((*(l)) = (r)->rte), (l)) : NULL) -#define RTE_OR_NULL(r) ((r) ? &((r)->rte) : NULL) +#define RTES_CLONE(r, l) ((r) ? (((*(l)) = (r)->rte), (l)) : NULL) +#define RTES_OR_NULL(r) ((r) ? &((r)->rte) : NULL) #define REF_FILTERED 2 /* Route is rejected by import filter */ -#define REF_STALE 4 /* Route is stale in a refresh cycle */ -#define REF_DISCARD 8 /* Route is scheduled for discard */ -#define REF_MODIFY 16 /* Route is scheduled for modify */ +#define REF_USE_STALE 4 /* Do not reset route's stale_cycle to the actual value */ /* Route is valid for propagation (may depend on other flags in the future), accepts NULL */ static inline int rte_is_valid(const rte *r) { return r && !(r->flags & REF_FILTERED); } @@ -283,7 +278,6 @@ struct rt_import_request { /* Preimport is called when the @new route is just-to-be inserted, replacing @old. * Return a route (may be different or modified in-place) to continue or NULL to withdraw. */ struct rte *(*preimport)(struct rt_import_request *req, struct rte *new, struct rte *old); - struct rte *(*rte_modify)(struct rte *, struct linpool *); }; struct rt_import_hook { @@ -303,6 +297,10 @@ struct rt_import_hook { btime last_state_change; /* Time of last state transition */ u8 import_state; /* IS_* */ + u8 stale_set; /* Set this stale_cycle to imported routes */ + u8 stale_valid; /* Routes with this stale_cycle and bigger are considered valid */ + u8 stale_pruned; /* Last prune finished when this value was set at stale_valid */ + u8 stale_pruning; /* Last prune started when this value was set at stale_valid */ void (*stopped)(struct rt_import_request *); /* Stored callback when import is stopped */ }; @@ -455,9 +453,9 @@ void *net_route(rtable *tab, const net_addr *n); int net_roa_check(rtable *tab, const net_addr *n, u32 asn); int rt_examine(rtable *t, net_addr *a, struct channel *c, const struct filter *filter); rte *rt_export_merged(struct channel *c, rte ** feed, uint count, linpool *pool, int silent); -void rt_refresh_begin(rtable *t, struct rt_import_request *); -void rt_refresh_end(rtable *t, struct rt_import_request *); -void rt_modify_stale(rtable *t, struct rt_import_request *); + +void rt_refresh_begin(struct rt_import_request *); +void rt_refresh_end(struct rt_import_request *); void rt_schedule_prune(rtable *t); void rte_dump(struct rte_storage *); void rte_free(struct rte_storage *, rtable *); @@ -466,15 +464,9 @@ void rt_dump(rtable *); void rt_dump_all(void); void rt_dump_hooks(rtable *); void rt_dump_hooks_all(void); -int rt_reload_channel(struct channel *c); -void rt_reload_channel_abort(struct channel *c); -void rt_refeed_channel(struct channel *c); void rt_prune_sync(rtable *t, int all); -int rte_update_in(struct channel *c, const net_addr *n, rte *new, struct rte_src *src); -int rte_update_out(struct channel *c, const net_addr *n, rte *new, const rte *old, struct rte_storage **old_exported); struct rtable_config *rt_new_table(struct symbol *s, uint addr_type); - /* Default limit for ECMP next hops, defined in sysdep code */ extern const int rt_default_ecmp; @@ -789,6 +781,7 @@ void rta__free(rta *r); static inline void rta_free(rta *r) { if (r && !--r->uc) rta__free(r); } rta *rta_do_cow(rta *o, linpool *lp); static inline rta * rta_cow(rta *r, linpool *lp) { return rta_is_cached(r) ? rta_do_cow(r, lp) : r; } +static inline void rta_uncache(rta *r) { r->cached = 0; r->uc = 0; } void rta_dump(rta *); void rta_dump_all(void); void rta_show(struct cli *, rta *); diff --git a/nest/rt-attr.c b/nest/rt-attr.c index 1bece201..77fd3c3b 100644 --- a/nest/rt-attr.c +++ b/nest/rt-attr.c @@ -1245,8 +1245,7 @@ rta_do_cow(rta *o, linpool *lp) memcpy(*nhn, nho, nexthop_size(nho)); nhn = &((*nhn)->next); } - r->cached = 0; - r->uc = 0; + rta_uncache(r); return r; } diff --git a/nest/rt-show.c b/nest/rt-show.c index 235d72e4..d942b8e1 100644 --- a/nest/rt-show.c +++ b/nest/rt-show.c @@ -95,7 +95,10 @@ rt_show_rte(struct cli *c, byte *ia, rte *e, struct rt_show_data *d, int primary } if (d->verbose) + { + cli_printf(c, -1008, "\tInternal route ID: %uL %uG %uS", e->src->private_id, e->src->global_id, e->stale_cycle); rta_show(c, a); + } } static uint @@ -103,7 +106,7 @@ rte_feed_count(net *n) { uint count = 0; for (struct rte_storage *e = n->routes; e; e = e->next) - if (rte_is_valid(RTE_OR_NULL(e))) + if (rte_is_valid(RTES_OR_NULL(e))) count++; return count; } @@ -113,7 +116,7 @@ rte_feed_obtain(net *n, rte **feed, uint count) { uint i = 0; for (struct rte_storage *e = n->routes; e; e = e->next) - if (rte_is_valid(RTE_OR_NULL(e))) + if (rte_is_valid(RTES_OR_NULL(e))) { ASSERT_DIE(i < count); feed[i++] = &e->rte; diff --git a/nest/rt-table.c b/nest/rt-table.c index e7ff2816..2f480992 100644 --- a/nest/rt-table.c +++ b/nest/rt-table.c @@ -50,7 +50,6 @@ pool *rt_table_pool; static linpool *rte_update_pool; list routing_tables; -list deleted_routing_tables; static void rt_free_hostcache(rtable *tab); static void rt_notify_hostcache(rtable *tab, net *net); @@ -386,9 +385,11 @@ rte_mergable(rte *pri, rte *sec) static void rte_trace(const char *name, const rte *e, int dir, const char *msg) { - log(L_TRACE "%s %c %s %N %uL %uG %s", - name, dir, msg, e->net, e->src->private_id, e->src->global_id, - rta_dest_name(e->attrs->dest)); + log(L_TRACE "%s %c %s %N src %uL %uG %uS %s%s", + name, dir, msg, e->net, + e->src->private_id, e->src->global_id, e->stale_cycle, + rta_dest_name(e->attrs->dest), + rte_is_filtered(e) ? " (filtered)" : ""); } static inline void @@ -427,7 +428,7 @@ rte_feed_count(net *n) { uint count = 0; for (struct rte_storage *e = n->routes; e; e = e->next) - if (rte_is_valid(RTE_OR_NULL(e))) + if (rte_is_valid(RTES_OR_NULL(e))) count++; return count; } @@ -437,7 +438,7 @@ rte_feed_obtain(net *n, struct rte **feed, uint count) { uint i = 0; for (struct rte_storage *e = n->routes; e; e = e->next) - if (rte_is_valid(RTE_OR_NULL(e))) + if (rte_is_valid(RTES_OR_NULL(e))) { ASSERT_DIE(i < count); feed[i++] = &e->rte; @@ -508,10 +509,11 @@ export_filter(struct channel *c, rte *rt, int silent) return export_filter_(c, rt, rte_update_pool, silent); } +void do_rt_notify_direct(struct channel *c, const net_addr *net, rte *new, const rte *old); + static void do_rt_notify(struct channel *c, const net_addr *net, rte *new, const rte *old) { - struct proto *p = c->proto; struct channel_export_stats *stats = &c->export_stats; if (c->refeeding && new) @@ -528,28 +530,31 @@ do_rt_notify(struct channel *c, const net_addr *net, rte *new, const rte *old) if (!new && old) CHANNEL_LIMIT_POP(c, OUT); - /* Apply export table */ - struct rte_storage *old_exported = NULL; - if (c->out_table) - { - if (!rte_update_out(c, net, new, old, &old_exported)) - { - channel_rte_trace_out(D_ROUTES, c, new, "idempotent"); - return; - } - } - - if (new) - stats->updates_accepted++; - else - stats->withdraws_accepted++; - + /* Store route export state */ if (old) bmap_clear(&c->export_map, old->id); if (new) bmap_set(&c->export_map, new->id); + /* Apply export table */ + if (c->out_table) + rte_import(&c->out_table->push, net, new, old ? old->src : new->src); + else + do_rt_notify_direct(c, net, new, old); +} + +void +do_rt_notify_direct(struct channel *c, const net_addr *net, rte *new, const rte *old) +{ + struct proto *p = c->proto; + struct channel_export_stats *stats = &c->export_stats; + + if (new) + stats->updates_accepted++; + else + stats->withdraws_accepted++; + if (p->debug & D_ROUTES) { if (new && old) @@ -560,10 +565,7 @@ do_rt_notify(struct channel *c, const net_addr *net, rte *new, const rte *old) channel_rte_trace_out(D_ROUTES, c, old, "removed"); } - p->rt_notify(p, c, net, new, old_exported ? &old_exported->rte : old); - - if (c->out_table && old_exported) - rte_free(old_exported, c->out_table); + p->rt_notify(p, c, net, new, old); } static void @@ -784,7 +786,7 @@ rt_notify_optimal(struct rt_export_request *req, const net_addr *net, struct rt_ rte n0; if (rpe->new_best != rpe->old_best) - rt_notify_basic(c, net, RTE_COPY(rpe->new_best, &n0), RTE_OR_NULL(rpe->old_best)); + rt_notify_basic(c, net, RTES_CLONE(rpe->new_best, &n0), RTES_OR_NULL(rpe->old_best)); /* Drop the old stored rejection if applicable. * new->id == old->id happens when updating hostentries. */ @@ -799,7 +801,7 @@ rt_notify_any(struct rt_export_request *req, const net_addr *net, struct rt_pend rte n0; if (rpe->new != rpe->old) - rt_notify_basic(c, net, RTE_COPY(rpe->new, &n0), RTE_OR_NULL(rpe->old)); + rt_notify_basic(c, net, RTES_CLONE(rpe->new, &n0), RTES_OR_NULL(rpe->old)); /* Drop the old stored rejection if applicable. * new->id == old->id happens when updating hostentries. */ @@ -974,6 +976,10 @@ rte_recalculate(struct rt_import_hook *c, net *net, rte *new, struct rte_src *sr rte *old_best = old_best_stored ? &old_best_stored->rte : NULL; rte *old = NULL; + /* Set the stale cycle unless already set */ + if (new && !(new->flags & REF_USE_STALE)) + new->stale_cycle = c->stale_set; + /* Find and remove original route from the same protocol */ struct rte_storage **before_old = rte_find(net, src); @@ -999,8 +1005,7 @@ rte_recalculate(struct rt_import_hook *c, net *net, rte *new, struct rte_src *sr if (new && rte_same(old, new)) { /* No changes, ignore the new route and refresh the old one */ - - old->flags &= ~(REF_STALE | REF_DISCARD | REF_MODIFY); + old->stale_cycle = new->stale_cycle; if (!rte_is_filtered(new)) { @@ -1138,18 +1143,23 @@ rte_recalculate(struct rt_import_hook *c, net *net, rte *new, struct rte_src *sr new_stored->rte.id = old->id; } + _Bool nb = (new_stored == net->routes); + _Bool ob = (old_best == old); + /* Log the route change */ - if (new_ok) - rt_rte_trace_in(D_ROUTES, req, &new_stored->rte, new_stored == net->routes ? "added [best]" : "added"); - else if (old_ok) + if (new_ok && old_ok) { - if (old != old_best) - rt_rte_trace_in(D_ROUTES, req, old, "removed"); - else if (net->routes && rte_is_ok(&net->routes->rte)) - rt_rte_trace_in(D_ROUTES, req, old, "removed [replaced]"); - else - rt_rte_trace_in(D_ROUTES, req, old, "removed [sole]"); + const char *best_indicator[2][2] = { { "updated", "updated [-best]" }, { "updated [+best]", "updated [best]" } }; + rt_rte_trace_in(D_ROUTES, req, &new_stored->rte, best_indicator[nb][ob]); } + else if (new_ok) + rt_rte_trace_in(D_ROUTES, req, &new_stored->rte, + (!net->routes->next || !rte_is_ok(&net->routes->next->rte)) ? "added [sole]" : + nb ? "added [best]" : "added"); + else if (old_ok) + rt_rte_trace_in(D_ROUTES, req, old, + (!net->routes || !rte_is_ok(&net->routes->rte)) ? "removed [sole]" : + ob ? "removed [best]" : "removed"); /* Propagate the route change */ rte_announce(table, net, new_stored, old_stored, @@ -1197,12 +1207,15 @@ channel_preimport(struct rt_import_request *req, rte *new, rte *old) { struct channel *c = SKIP_BACK(struct channel, in_req, req); - if (new && !old) - if (CHANNEL_LIMIT_PUSH(c, RX)) - return NULL; + if (!c->in_table) + { + if (new && !old) + if (CHANNEL_LIMIT_PUSH(c, RX)) + return NULL; - if (!new && old) - CHANNEL_LIMIT_POP(c, RX); + if (!new && old) + CHANNEL_LIMIT_POP(c, RX); + } int new_in = new && !rte_is_filtered(new); int old_in = old && !rte_is_filtered(old); @@ -1223,7 +1236,22 @@ channel_preimport(struct rt_import_request *req, rte *new, rte *old) return new; } -static void rte_update_direct(struct channel *c, const net_addr *n, rte *new, struct rte_src *src); +rte * +channel_in_preimport(struct rt_import_request *req, rte *new, rte *old) +{ + struct channel_aux_table *cat = SKIP_BACK(struct channel_aux_table, push, req); + + if (new && !old) + if (CHANNEL_LIMIT_PUSH(cat->c, RX)) + return NULL; + + if (!new && old) + CHANNEL_LIMIT_POP(cat->c, RX); + + return new; +} + +void rte_update_direct(struct channel *c, const net_addr *n, rte *new, struct rte_src *src); void rte_update(struct channel *c, const net_addr *n, rte *new, struct rte_src *src) @@ -1233,13 +1261,14 @@ rte_update(struct channel *c, const net_addr *n, rte *new, struct rte_src *src) ASSERT(c->channel_state == CS_UP); - if (c->in_table && !rte_update_in(c, n, new, src)) - return; - return rte_update_direct(c, n, new, src); + if (c->in_table) + rte_import(&c->in_table->push, n, new, src); + else + rte_update_direct(c, n, new, src); } -static void +void rte_update_direct(struct channel *c, const net_addr *n, rte *new, struct rte_src *src) { const struct filter *filter = c->in_filter; @@ -1323,24 +1352,6 @@ rte_discard(net *net, rte *old) /* Non-filtered route deletion, used during garb rte_update_unlock(); } -/* Modify existing route by protocol hook, used for long-lived graceful restart */ -static inline void -rte_modify(net *net, rte *old) -{ - rte_update_lock(); - - rte *new = old->sender->req->rte_modify(old, rte_update_pool); - if (new != old) - { - if (new) - new->flags = old->flags & ~REF_MODIFY; - - rte_recalculate(old->sender, net, new, old->src); - } - - rte_update_unlock(); -} - /* Check rtable for best route to given net whether it would be exported do p */ int rt_examine(rtable *t, net_addr *a, struct channel *c, const struct filter *filter) @@ -1419,6 +1430,9 @@ rt_request_import(rtable *tab, struct rt_import_request *req) hook->req = req; hook->table = tab; + if (!hook->stale_set) + hook->stale_set = hook->stale_valid = hook->stale_pruning = hook->stale_pruned = 1; + rt_set_import_state(hook, TIS_UP); hook->n = (node) {}; @@ -1499,20 +1513,41 @@ rt_stop_export(struct rt_export_request *req, void (*stopped)(struct rt_export_r * routes to the routing table (by rte_update()). After that, all protocol * routes (more precisely routes with @c as @sender) not sent during the * refresh cycle but still in the table from the past are pruned. This is - * implemented by marking all related routes as stale by REF_STALE flag in - * rt_refresh_begin(), then marking all related stale routes with REF_DISCARD - * flag in rt_refresh_end() and then removing such routes in the prune loop. - */ + * implemented by setting rte->stale_cycle to req->stale_set in rte_update() + * and then dropping all routes with old stale_cycle values in table prune loop. */ void -rt_refresh_begin(rtable *t, struct rt_import_request *req) +rt_refresh_begin(struct rt_import_request *req) { - FIB_WALK(&t->fib, net, n) - { - for (struct rte_storage *e = n->routes; e; e = e->next) - if (e->rte.sender == req->hook) - e->rte.flags |= REF_STALE; - } - FIB_WALK_END; + struct rt_import_hook *hook = req->hook; + ASSERT_DIE(hook); + + ASSERT_DIE(hook->stale_set == hook->stale_valid); + + /* If the pruning routine is too slow */ + if ((hook->stale_pruned < hook->stale_valid) && (hook->stale_pruned + 128 < hook->stale_valid) + || (hook->stale_pruned > hook->stale_valid) && (hook->stale_pruned > hook->stale_valid + 128)) + { + log(L_WARN "Route refresh flood in table %s", hook->table->name); + FIB_WALK(&hook->table->fib, net, n) + { + for (struct rte_storage *e = n->routes; e; e = e->next) + if (e->rte.sender == req->hook) + e->rte.stale_cycle = 0; + } + FIB_WALK_END; + hook->stale_set = 1; + hook->stale_valid = 0; + hook->stale_pruned = 0; + } + else if (!++hook->stale_set) + { + /* Let's reserve the stale_cycle zero value for always-invalid routes */ + hook->stale_set = 1; + hook->stale_valid = 0; + } + + if (req->trace_routes & D_STATES) + log(L_TRACE "%s: route refresh begin [%u]", req->name, hook->stale_set); } /** @@ -1524,43 +1559,18 @@ rt_refresh_begin(rtable *t, struct rt_import_request *req) * hook. See rt_refresh_begin() for description of refresh cycles. */ void -rt_refresh_end(rtable *t, struct rt_import_request *req) +rt_refresh_end(struct rt_import_request *req) { - int prune = 0; + struct rt_import_hook *hook = req->hook; + ASSERT_DIE(hook); - FIB_WALK(&t->fib, net, n) - { - for (struct rte_storage *e = n->routes; e; e = e->next) - if ((e->rte.sender == req->hook) && (e->rte.flags & REF_STALE)) - { - e->rte.flags |= REF_DISCARD; - prune = 1; - } - } - FIB_WALK_END; + hook->stale_valid++; + ASSERT_DIE(hook->stale_set == hook->stale_valid); - if (prune) - rt_schedule_prune(t); -} + rt_schedule_prune(hook->table); -void -rt_modify_stale(rtable *t, struct rt_import_request *req) -{ - int prune = 0; - - FIB_WALK(&t->fib, net, n) - { - for (struct rte_storage *e = n->routes; e; e = e->next) - if ((e->rte.sender == req->hook) && (e->rte.flags & REF_STALE) && !(e->rte.flags & REF_FILTERED)) - { - e->rte.flags |= REF_MODIFY; - prune = 1; - } - } - FIB_WALK_END; - - if (prune) - rt_schedule_prune(t); + if (req->trace_routes & D_STATES) + log(L_TRACE "%s: route refresh end [%u]", req->name, hook->stale_valid); } /** @@ -1613,9 +1623,6 @@ rt_dump_all(void) WALK_LIST2(t, n, routing_tables, n) rt_dump(t); - - WALK_LIST2(t, n, deleted_routing_tables, n) - rt_dump(t); } void @@ -1658,9 +1665,6 @@ rt_dump_hooks_all(void) WALK_LIST2(t, n, routing_tables, n) rt_dump_hooks(t); - - WALK_LIST2(t, n, deleted_routing_tables, n) - rt_dump_hooks(t); } static inline void @@ -1796,9 +1800,7 @@ rt_free(resource *_r) DBG("Deleting routing table %s\n", r->name); ASSERT_DIE(r->use_count == 0); - - if (r->internal) - return; + ASSERT_DIE(r->deleted); r->config->table = NULL; rem_node(&r->n); @@ -1853,20 +1855,17 @@ rt_setup(pool *pp, struct rtable_config *cf) fib_init(&t->fib, p, t->addr_type, sizeof(net), OFFSETOF(net, n), 0, NULL); - if (!(t->internal = cf->internal)) - { - init_list(&t->imports); - init_list(&t->exports); - hmap_init(&t->id_map, p, 1024); - hmap_set(&t->id_map, 0); + init_list(&t->imports); + init_list(&t->exports); + hmap_init(&t->id_map, p, 1024); + hmap_set(&t->id_map, 0); - init_list(&t->subscribers); + init_list(&t->subscribers); - t->rt_event = ev_new_init(p, rt_event, t); - t->last_rt_change = t->gc_time = current_time(); + t->rt_event = ev_new_init(p, rt_event, t); + t->last_rt_change = t->gc_time = current_time(); - t->rl_pipe = (struct tbf) TBF_DEFAULT_LOG_LIMITS; - } + t->rl_pipe = (struct tbf) TBF_DEFAULT_LOG_LIMITS; return t; } @@ -1884,7 +1883,6 @@ rt_init(void) rt_table_pool = rp_new(&root_pool, "Routing tables"); rte_update_pool = lp_new_default(rt_table_pool); init_list(&routing_tables); - init_list(&deleted_routing_tables); } @@ -1925,6 +1923,13 @@ rt_prune_table(rtable *tab) WALK_LIST2(ih, n, tab->imports, n) if (ih->import_state == TIS_STOP) rt_set_import_state(ih, TIS_FLUSHING); + else if ((ih->stale_valid != ih->stale_pruning) && (ih->stale_pruning == ih->stale_pruned)) + { + ih->stale_pruning = ih->stale_valid; + + if (ih->req->trace_routes & D_STATES) + log(L_TRACE "%s: table prune after refresh begin [%u]", ih->req->name, ih->stale_pruning); + } FIB_ITERATE_INIT(fit, &tab->fib); tab->prune_state = 2; @@ -1936,7 +1941,11 @@ again: rescan: for (struct rte_storage *e=n->routes; e; e=e->next) { - if ((e->rte.sender->import_state == TIS_FLUSHING) || (e->rte.flags & REF_DISCARD)) + struct rt_import_hook *s = e->rte.sender; + + if ((s->import_state == TIS_FLUSHING) || + (e->rte.stale_cycle < s->stale_valid) || + (e->rte.stale_cycle > s->stale_set)) { if (limit <= 0) { @@ -1948,21 +1957,6 @@ again: rte_discard(n, &e->rte); limit--; - goto rescan; - } - - if (e->rte.flags & REF_MODIFY) - { - if (limit <= 0) - { - FIB_ITERATE_PUT(fit); - ev_schedule(tab->rt_event); - return; - } - - rte_modify(n, &e->rte); - limit--; - goto rescan; } } @@ -1998,6 +1992,13 @@ again: mb_free(ih); rt_unlock_table(tab); } + else if (ih->stale_pruning != ih->stale_pruned) + { + ih->stale_pruned = ih->stale_pruning; + + if (ih->req->trace_routes & D_STATES) + log(L_TRACE "%s: table prune after refresh end [%u]", ih->req->name, ih->stale_pruned); + } } void @@ -2204,7 +2205,10 @@ rt_next_hop_update_net(rtable *tab, net *n) for (int i=0; irte.sender->req, &updates[i].new->rte, best_indicator[nb][ob]); rte_announce_i(tab, n, updates[i].new, updates[i].old, new, old_best); } @@ -2308,11 +2312,12 @@ rt_unlock_table(rtable *r) { if (!--r->use_count && r->deleted) { - struct config *conf = r->deleted; + void *del_data = r->del_data; + void (*deleted)(void *) = r->deleted; /* Delete the routing table by freeing its pool */ rt_shutdown(r); - config_del_obstacle(conf); + deleted(del_data); } } @@ -2323,6 +2328,8 @@ rt_find_table_config(struct config *cf, char *name) return (sym && (sym->class == SYM_TABLE)) ? sym->table : NULL; } +static void rt_config_del_obstacle(void *data) { config_del_obstacle(data); } + /** * rt_commit - commit new routing table configuration * @new: new configuration @@ -2361,9 +2368,10 @@ rt_commit(struct config *new, struct config *old) else { DBG("\t%s: deleted\n", o->name); - ot->deleted = old; - config_add_obstacle(old); rt_lock_table(ot); + ot->deleted = rt_config_del_obstacle; + ot->del_data = old; + config_add_obstacle(old); rt_unlock_table(ot); } } @@ -2441,285 +2449,6 @@ done: } -/* - * Import table - */ - -int -rte_update_in(struct channel *c, const net_addr *n, rte *new, struct rte_src *src) -{ - struct rtable *tab = c->in_table; - net *net; - - if (new) - net = net_get(tab, n); - else - { - net = net_find(tab, n); - - if (!net) - goto drop_withdraw; - } - - /* Find the old rte */ - struct rte_storage **pos = rte_find(net, src); - if (*pos) - { - rte *old = &(*pos)->rte; - if (new && rte_same(old, new)) - { - /* Refresh the old rte, continue with update to main rtable */ - if (old->flags & (REF_STALE | REF_DISCARD | REF_MODIFY)) - { - old->flags &= ~(REF_STALE | REF_DISCARD | REF_MODIFY); - return 1; - } - - goto drop_update; - } - - if (!new) - CHANNEL_LIMIT_POP(c, RX); - - /* Move iterator if needed */ - if (*pos == c->reload_next_rte) - c->reload_next_rte = (*pos)->next; - - /* Remove the old rte */ - struct rte_storage *del = *pos; - *pos = (*pos)->next; - rte_free(del, tab); - tab->rt_count--; - } - else if (new) - { - if (CHANNEL_LIMIT_PUSH(c, RX)) - { - /* Required by rte_trace_in() */ - new->net = n; - - channel_rte_trace_in(D_FILTERS, c, new, "ignored [limit]"); - goto drop_update; - } - } - else - goto drop_withdraw; - - if (!new) - { - if (!net->routes) - fib_delete(&tab->fib, net); - - return 1; - } - - /* Insert the new rte */ - struct rte_storage *e = rte_store(new, net, tab); - e->rte.lastmod = current_time(); - e->next = *pos; - *pos = e; - tab->rt_count++; - return 1; - -drop_update: - c->import_stats.updates_received++; - c->in_req.hook->stats.updates_ignored++; - - if (!net->routes) - fib_delete(&tab->fib, net); - - return 0; - -drop_withdraw: - c->import_stats.withdraws_received++; - c->in_req.hook->stats.withdraws_ignored++; - return 0; -} - -int -rt_reload_channel(struct channel *c) -{ - struct rtable *tab = c->in_table; - struct fib_iterator *fit = &c->reload_fit; - int max_feed = 64; - - ASSERT(c->channel_state == CS_UP); - - if (!c->reload_active) - { - FIB_ITERATE_INIT(fit, &tab->fib); - c->reload_active = 1; - } - - do { - for (struct rte_storage *e = c->reload_next_rte; e; e = e->next) - { - if (max_feed-- <= 0) - { - c->reload_next_rte = e; - debug("%s channel reload burst split (max_feed=%d)", c->proto->name, max_feed); - return 0; - } - - rte r = e->rte; - rte_update_direct(c, r.net, &r, r.src); - } - - c->reload_next_rte = NULL; - - FIB_ITERATE_START(&tab->fib, fit, net, n) - { - if (c->reload_next_rte = n->routes) - { - FIB_ITERATE_PUT_NEXT(fit, &tab->fib); - break; - } - } - FIB_ITERATE_END; - } - while (c->reload_next_rte); - - c->reload_active = 0; - return 1; -} - -void -rt_reload_channel_abort(struct channel *c) -{ - if (c->reload_active) - { - /* Unlink the iterator */ - fit_get(&c->in_table->fib, &c->reload_fit); - c->reload_next_rte = NULL; - c->reload_active = 0; - } -} - -void -rt_prune_sync(rtable *t, int all) -{ - struct fib_iterator fit; - - FIB_ITERATE_INIT(&fit, &t->fib); - -again: - FIB_ITERATE_START(&t->fib, &fit, net, n) - { - struct rte_storage *e, **ee = &n->routes; - - while (e = *ee) - { - if (all || (e->rte.flags & (REF_STALE | REF_DISCARD))) - { - *ee = e->next; - rte_free(e, t); - t->rt_count--; - } - else - ee = &e->next; - } - - if (all || !n->routes) - { - FIB_ITERATE_PUT(&fit); - fib_delete(&t->fib, n); - goto again; - } - } - FIB_ITERATE_END; -} - - -/* - * Export table - */ - -int -rte_update_out(struct channel *c, const net_addr *n, rte *new, const rte *old0, struct rte_storage **old_exported) -{ - struct rtable *tab = c->out_table; - struct rte_src *src; - net *net; - - if (new) - { - net = net_get(tab, n); - src = new->src; - } - else - { - net = net_find(tab, n); - src = old0->src; - - if (!net) - goto drop; - } - - /* Find the old rte */ - struct rte_storage **pos = (c->ra_mode == RA_ANY) ? rte_find(net, src) : &net->routes; - struct rte_storage *old = NULL; - - if (old = *pos) - { - if (new && rte_same(&(*pos)->rte, new)) - goto drop; - - /* Remove the old rte */ - *pos = old->next; - *old_exported = old; - tab->rt_count--; - } - - if (!new) - { - if (!old) - goto drop; - - if (!net->routes) - fib_delete(&tab->fib, net); - - return 1; - } - - /* Insert the new rte */ - struct rte_storage *e = rte_store(new, net, tab); - e->rte.lastmod = current_time(); - e->next = *pos; - *pos = e; - tab->rt_count++; - return 1; - -drop: - return 0; -} - -void -rt_refeed_channel(struct channel *c) -{ - if (!c->out_table) - { - channel_request_feeding(c); - return; - } - - ASSERT_DIE(c->ra_mode != RA_ANY); - - c->proto->feed_begin(c, 0); - - FIB_WALK(&c->out_table->fib, net, n) - { - if (!n->routes) - continue; - - rte e = n->routes->rte; - c->proto->rt_notify(c->proto, c, n->n.addr, &e, NULL); - } - FIB_WALK_END; - - c->proto->feed_end(c); -} - - /* * Hostcache */ diff --git a/proto/bgp/attrs.c b/proto/bgp/attrs.c index 90490b4f..892b26e3 100644 --- a/proto/bgp/attrs.c +++ b/proto/bgp/attrs.c @@ -2267,30 +2267,44 @@ bgp_rte_recalculate(rtable *table, net *net, rte *new, rte *old, rte *old_best) return !old_suppressed; } -rte * -bgp_rte_modify_stale(struct rte *r, struct linpool *pool) +void +bgp_rte_modify_stale(struct rt_export_request *req, const net_addr *n, struct rt_pending_export *rpe UNUSED, rte **feed, uint count) { - eattr *ea = ea_find(r->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_COMMUNITY)); - const struct adata *ad = ea ? ea->u.ptr : NULL; - uint flags = ea ? ea->flags : BAF_PARTIAL; + struct bgp_channel *c = SKIP_BACK(struct bgp_channel, stale_feed, req); - if (ad && int_set_contains(ad, BGP_COMM_NO_LLGR)) - return NULL; + do { + rte *r = feed[--count]; + if (r->sender != c->c.in_req.hook) + continue; - if (ad && int_set_contains(ad, BGP_COMM_LLGR_STALE)) - return r; + /* A new route, do not mark as stale */ + if (r->stale_cycle == c->c.in_req.hook->stale_set) + continue; - rta *a = rta_do_cow(r->attrs, pool); - - _Thread_local static rte e0; - e0 = *r; - e0.attrs = a; + eattr *ea = ea_find(r->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_COMMUNITY)); + const struct adata *ad = ea ? ea->u.ptr : NULL; + uint flags = ea ? ea->flags : BAF_PARTIAL; - bgp_set_attr_ptr(&(a->eattrs), pool, BA_COMMUNITY, flags, - int_set_add(pool, ad, BGP_COMM_LLGR_STALE)); - e0.pflags |= BGP_REF_STALE; + rte e0 = *r; + e0.flags |= REF_USE_STALE; - return &e0; + if (ad && int_set_contains(ad, BGP_COMM_NO_LLGR)) + rte_import(&c->c.in_req, n, NULL, r->src); + + else if (ad && int_set_contains(ad, BGP_COMM_LLGR_STALE)) + rte_import(&c->c.in_req, n, &e0, r->src); + + else { + rta *a = e0.attrs = rta_do_cow(r->attrs, bgp_linpool); + + bgp_set_attr_ptr(&(a->eattrs), bgp_linpool, BA_COMMUNITY, flags, + int_set_add(bgp_linpool, ad, BGP_COMM_LLGR_STALE)); + e0.pflags |= BGP_REF_STALE; + + rte_import(&c->c.in_req, n, &e0, r->src); + lp_flush(bgp_linpool); + } + } while (count); } diff --git a/proto/bgp/bgp.c b/proto/bgp/bgp.c index 78c36bc7..35e9ea59 100644 --- a/proto/bgp/bgp.c +++ b/proto/bgp/bgp.c @@ -140,6 +140,15 @@ static void bgp_update_bfd(struct bgp_proto *p, const struct bfd_options *bfd); static int bgp_incoming_connection(sock *sk, uint dummy UNUSED); static void bgp_listen_sock_err(sock *sk UNUSED, int err); +static void bgp_graceful_restart_feed(struct bgp_channel *c); +static inline void channel_refresh_end_reload(struct channel *c) +{ + channel_refresh_end(c); + + if (c->in_table) + channel_request_reload(c); +} + /** * bgp_open - open a BGP instance * @p: BGP instance @@ -775,25 +784,25 @@ bgp_handle_graceful_restart(struct bgp_proto *p) { case BGP_GRS_NONE: c->gr_active = BGP_GRS_ACTIVE; - rt_refresh_begin(c->c.table, &c->c.in_req); + channel_refresh_begin(&c->c); break; case BGP_GRS_ACTIVE: - rt_refresh_end(c->c.table, &c->c.in_req); - rt_refresh_begin(c->c.table, &c->c.in_req); + channel_refresh_end(&c->c); + channel_refresh_begin(&c->c); break; case BGP_GRS_LLGR: - rt_refresh_begin(c->c.table, &c->c.in_req); - rt_modify_stale(c->c.table, &c->c.in_req); + channel_refresh_begin(&c->c); + bgp_graceful_restart_feed(c); break; } } else { /* Just flush the routes */ - rt_refresh_begin(c->c.table, &c->c.in_req); - rt_refresh_end(c->c.table, &c->c.in_req); + channel_refresh_begin(&c->c); + channel_refresh_end(&c->c); } /* Reset bucket and prefix tables */ @@ -811,6 +820,50 @@ bgp_handle_graceful_restart(struct bgp_proto *p) tm_start(p->gr_timer, p->conn->remote_caps->gr_time S); } +static void +bgp_graceful_restart_feed_done(struct rt_export_request *req) +{ + req->hook = NULL; +} + +static void +bgp_graceful_restart_feed_dump_req(struct rt_export_request *req) +{ + struct bgp_channel *c = SKIP_BACK(struct bgp_channel, stale_feed, req); + debug(" BGP-GR %s.%s export request %p\n", c->c.proto->name, c->c.name, req); +} + +static void +bgp_graceful_restart_feed_log_state_change(struct rt_export_request *req, u8 state) +{ + struct bgp_channel *c = SKIP_BACK(struct bgp_channel, stale_feed, req); + struct bgp_proto *p = (void *) c->c.proto; + BGP_TRACE(D_EVENTS, "Long-lived graceful restart export state changed to %s", rt_export_state_name(state)); + + if (state == TES_READY) + rt_stop_export(req, bgp_graceful_restart_feed_done); +} + +static void +bgp_graceful_restart_drop_export(struct rt_export_request *req UNUSED, const net_addr *n UNUSED, struct rt_pending_export *rpe UNUSED) +{ /* Nothing to do */ } + +static void +bgp_graceful_restart_feed(struct bgp_channel *c) +{ + c->stale_feed = (struct rt_export_request) { + .name = "BGP-GR", + .trace_routes = c->c.debug | c->c.proto->debug, + .dump_req = bgp_graceful_restart_feed_dump_req, + .log_state_change = bgp_graceful_restart_feed_log_state_change, + .export_bulk = bgp_rte_modify_stale, + .export_one = bgp_graceful_restart_drop_export, + }; + + rt_request_export(c->c.table, &c->stale_feed); +} + + /** * bgp_graceful_restart_done - finish active BGP graceful restart * @c: BGP channel @@ -833,8 +886,11 @@ bgp_graceful_restart_done(struct bgp_channel *c) if (!p->gr_active_num) BGP_TRACE(D_EVENTS, "Neighbor graceful restart done"); + if (c->stale_feed.hook) + rt_stop_export(&c->stale_feed, bgp_graceful_restart_feed_done); + tm_stop(c->stale_timer); - rt_refresh_end(c->c.table, &c->c.in_req); + channel_refresh_end_reload(&c->c); } /** @@ -876,7 +932,7 @@ bgp_graceful_restart_timeout(timer *t) /* Channel is in GR, and supports LLGR -> start LLGR */ c->gr_active = BGP_GRS_LLGR; tm_start(c->stale_timer, c->stale_time S); - rt_modify_stale(c->c.table, &c->c.in_req); + bgp_graceful_restart_feed(c); } } else @@ -914,10 +970,7 @@ bgp_refresh_begin(struct bgp_channel *c) { log(L_WARN "%s: BEGIN-OF-RR received before END-OF-RIB, ignoring", p->p.name); return; } c->load_state = BFS_REFRESHING; - rt_refresh_begin(c->c.table, &c->c.in_req); - - if (c->c.in_table) - rt_refresh_begin(c->c.in_table, &c->c.in_req); + channel_refresh_begin(&c->c); } /** @@ -938,10 +991,7 @@ bgp_refresh_end(struct bgp_channel *c) { log(L_WARN "%s: END-OF-RR received without prior BEGIN-OF-RR, ignoring", p->p.name); return; } c->load_state = BFS_NONE; - rt_refresh_end(c->c.table, &c->c.in_req); - - if (c->c.in_table) - rt_prune_sync(c->c.in_table, 0); + channel_refresh_end_reload(&c->c); } @@ -1408,12 +1458,9 @@ bgp_reload_routes(struct channel *C) struct bgp_proto *p = (void *) C->proto; struct bgp_channel *c = (void *) C; - ASSERT(p->conn && (p->route_refresh || c->c.in_table)); + ASSERT(p->conn && (p->route_refresh)); - if (c->c.in_table) - channel_schedule_reload(C); - else - bgp_schedule_packet(p->conn, c, PKT_ROUTE_REFRESH); + bgp_schedule_packet(p->conn, c, PKT_ROUTE_REFRESH); } static void @@ -1693,7 +1740,6 @@ bgp_init(struct proto_config *CF) P->rte_better = bgp_rte_better; P->rte_mergable = bgp_rte_mergable; P->rte_recalculate = cf->deterministic_med ? bgp_rte_recalculate : NULL; - P->rte_modify = bgp_rte_modify_stale; P->rte_igp_metric = bgp_rte_igp_metric; p->cf = cf; @@ -1756,7 +1802,7 @@ bgp_channel_start(struct channel *C) bgp_init_prefix_table(c); if (c->cf->import_table) - channel_setup_in_table(C); + channel_setup_in_table(C, 0); if (c->cf->export_table) channel_setup_out_table(C); diff --git a/proto/bgp/bgp.h b/proto/bgp/bgp.h index c79dd1b2..342dc023 100644 --- a/proto/bgp/bgp.h +++ b/proto/bgp/bgp.h @@ -366,6 +366,7 @@ struct bgp_channel { timer *stale_timer; /* Long-lived stale timer for LLGR */ u32 stale_time; /* Stored LLGR stale time from last session */ + struct rt_export_request stale_feed; /* Feeder request for stale route modification */ u8 add_path_rx; /* Session expects receive of ADD-PATH extended NLRI */ u8 add_path_tx; /* Session expects transmit of ADD-PATH extended NLRI */ @@ -585,7 +586,7 @@ void bgp_free_prefix(struct bgp_channel *c, struct bgp_prefix *bp); int bgp_rte_better(struct rte *, struct rte *); int bgp_rte_mergable(rte *pri, rte *sec); int bgp_rte_recalculate(rtable *table, net *net, rte *new, rte *old, rte *old_best); -struct rte *bgp_rte_modify_stale(struct rte *r, struct linpool *pool); +void bgp_rte_modify_stale(struct rt_export_request *, const net_addr *, struct rt_pending_export *, rte **, uint); u32 bgp_rte_igp_metric(struct rte *); void bgp_rt_notify(struct proto *P, struct channel *C, const net_addr *n, rte *new, const rte *old); int bgp_preexport(struct channel *, struct rte *); diff --git a/proto/bgp/packets.c b/proto/bgp/packets.c index f1e6d7d2..647551e5 100644 --- a/proto/bgp/packets.c +++ b/proto/bgp/packets.c @@ -2695,7 +2695,7 @@ bgp_rx_route_refresh(struct bgp_conn *conn, byte *pkt, uint len) { case BGP_RR_REQUEST: BGP_TRACE(D_PACKETS, "Got ROUTE-REFRESH"); - rt_refeed_channel(&c->c); + channel_request_feeding(&c->c); break; case BGP_RR_BEGIN: diff --git a/proto/rpki/packets.c b/proto/rpki/packets.c index 943485d7..897edc09 100644 --- a/proto/rpki/packets.c +++ b/proto/rpki/packets.c @@ -661,9 +661,9 @@ rpki_handle_cache_response_pdu(struct rpki_cache *cache, const struct pdu_cache_ * a refresh cycle. */ if (cache->p->roa4_channel) - rt_refresh_begin(cache->p->roa4_channel->table, &cache->p->roa4_channel->in_req); + rt_refresh_begin(&cache->p->roa4_channel->in_req); if (cache->p->roa6_channel) - rt_refresh_begin(cache->p->roa6_channel->table, &cache->p->roa6_channel->in_req); + rt_refresh_begin(&cache->p->roa6_channel->in_req); cache->p->refresh_channels = 1; } @@ -819,9 +819,9 @@ rpki_handle_end_of_data_pdu(struct rpki_cache *cache, const struct pdu_end_of_da { cache->p->refresh_channels = 0; if (cache->p->roa4_channel) - rt_refresh_end(cache->p->roa4_channel->table, &cache->p->roa4_channel->in_req); + rt_refresh_end(&cache->p->roa4_channel->in_req); if (cache->p->roa6_channel) - rt_refresh_end(cache->p->roa6_channel->table, &cache->p->roa6_channel->in_req); + rt_refresh_end(&cache->p->roa6_channel->in_req); } cache->last_update = current_time(); diff --git a/sysdep/unix/krt.c b/sysdep/unix/krt.c index 40a58442..609ee921 100644 --- a/sysdep/unix/krt.c +++ b/sysdep/unix/krt.c @@ -285,249 +285,24 @@ krt_metric(rte *a) } static inline int -krt_same_key(rte *a, rte *b) +krt_rte_better(rte *a, rte *b) { - return (krt_metric(a) == krt_metric(b)); -} - -static inline int -krt_uptodate(rte *a, rte *b) -{ - return (a->attrs == b->attrs); -} - -static void -krt_learn_announce_update(struct krt_proto *p, rte *e) -{ - rte e0 = { - .attrs = rta_clone(e->attrs), - .src = p->p.main_source, - }; - - rte_update(p->p.main_channel, e->net, &e0, p->p.main_source); -} - -static void -krt_learn_announce_delete(struct krt_proto *p, net_addr *n) -{ - rte_update(p->p.main_channel, n, NULL, p->p.main_source); + return (krt_metric(a) > krt_metric(b)); } /* Called when alien route is discovered during scan */ static void -krt_learn_scan(struct krt_proto *p, rte *e) +krt_learn_rte(struct krt_proto *p, rte *e) { - net *n = net_get(p->krt_table, e->net); - struct rte_storage *m, **mm; - - struct rte_storage *ee = rte_store(e, n, p->krt_table); - - for(mm = &n->routes; m = *mm; mm = &m->next) - if (krt_same_key(&m->rte, e)) - break; - if (m) - { - if (krt_uptodate(&m->rte, e)) - { - krt_trace_in_rl(&rl_alien, p, e, "[alien] seen"); - rte_free(ee, p->krt_table); - m->rte.pflags |= KRT_REF_SEEN; - } - else - { - krt_trace_in(p, e, "[alien] updated"); - *mm = m->next; - rte_free(m, p->krt_table); - m = NULL; - } - } - else - krt_trace_in(p, e, "[alien] created"); - - if (!m) - { - ee->next = n->routes; - n->routes = ee; - ee->rte.pflags |= KRT_REF_SEEN; - } -} - -static void -krt_learn_prune(struct krt_proto *p) -{ - struct fib *fib = &p->krt_table->fib; - struct fib_iterator fit; - - KRT_TRACE(p, D_EVENTS, "Pruning inherited routes"); - - FIB_ITERATE_INIT(&fit, fib); -again: - FIB_ITERATE_START(fib, &fit, net, n) - { - struct rte_storage *e, **ee, *best, **pbest, *old_best; - - /* - * Note that old_best may be NULL even if there was an old best route in - * the previous step, because it might be replaced in krt_learn_scan(). - * But in that case there is a new valid best route. - */ - - old_best = NULL; - best = NULL; - pbest = NULL; - ee = &n->routes; - while (e = *ee) - { - if (e->rte.pflags & KRT_REF_BEST) - old_best = e; - - if (!(e->rte.pflags & KRT_REF_SEEN)) - { - *ee = e->next; - rte_free(e, p->krt_table); - continue; - } - - if (!best || krt_metric(&best->rte) > krt_metric(&e->rte)) - { - best = e; - pbest = ee; - } - - e->rte.pflags &= ~(KRT_REF_SEEN | KRT_REF_BEST); - ee = &e->next; - } - if (!n->routes) - { - DBG("%I/%d: deleting\n", n->n.prefix, n->n.pxlen); - if (old_best) - krt_learn_announce_delete(p, n->n.addr); - - FIB_ITERATE_PUT(&fit); - fib_delete(fib, n); - goto again; - } - - best->rte.pflags |= KRT_REF_BEST; - *pbest = best->next; - best->next = n->routes; - n->routes = best; - - if ((best != old_best) || p->reload) - { - DBG("%I/%d: announcing (metric=%d)\n", n->n.prefix, n->n.pxlen, krt_metric(&best->rte)); - krt_learn_announce_update(p, &best->rte); - } - else - DBG("%I/%d: uptodate (metric=%d)\n", n->n.prefix, n->n.pxlen, krt_metric(&best->rte)); - } - FIB_ITERATE_END; - - p->reload = 0; -} - -static void -krt_learn_async(struct krt_proto *p, rte *e, int new) -{ - net *n = net_get(p->krt_table, e->net); - struct rte_storage *g, **gg, *best, **bestp, *old_best; - - ASSERT(!e->attrs->cached); - e->attrs->pref = p->p.main_channel->preference; - - struct rte_storage *ee = rte_store(e, n, p->krt_table); - - old_best = n->routes; - for(gg=&n->routes; g = *gg; gg = &g->next) - if (krt_same_key(&g->rte, e)) - break; - if (new) - { - if (g) - { - if (krt_uptodate(&g->rte, e)) - { - krt_trace_in(p, e, "[alien async] same"); - rte_free(ee, p->krt_table); - return; - } - krt_trace_in(p, e, "[alien async] updated"); - *gg = g->next; - rte_free(g, p->krt_table); - } - else - krt_trace_in(p, e, "[alien async] created"); - - ee->next = n->routes; - n->routes = ee; - } - else if (!g) - { - krt_trace_in(p, e, "[alien async] delete failed"); - rte_free(ee, p->krt_table); - return; - } - else - { - krt_trace_in(p, e, "[alien async] removed"); - *gg = g->next; - rte_free(ee, p->krt_table); - rte_free(g, p->krt_table); - } - best = n->routes; - bestp = &n->routes; - for(gg=&n->routes; g=*gg; gg=&g->next) - { - if (krt_metric(&best->rte) > krt_metric(&g->rte)) - { - best = g; - bestp = gg; - } - - g->rte.pflags &= ~KRT_REF_BEST; - } - - if (best) - { - best->rte.pflags |= KRT_REF_BEST; - *bestp = best->next; - best->next = n->routes; - n->routes = best; - } - - if (best != old_best) - { - DBG("krt_learn_async: distributing change\n"); - if (best) - krt_learn_announce_update(p, &best->rte); - else - krt_learn_announce_delete(p, n->n.addr); - } + e->src = rt_get_source(&p->p, krt_metric(e)); + rte_update(p->p.main_channel, e->net, e, e->src); } static void krt_learn_init(struct krt_proto *p) { if (KRT_CF->learn) - { - struct rtable_config *cf = mb_allocz(p->p.pool, sizeof(struct rtable_config)); - cf->name = "Inherited"; - cf->addr_type = p->p.net_type; - cf->internal = 1; - - p->krt_table = rt_setup(p->p.pool, cf); - } -} - -static void -krt_dump(struct proto *P) -{ - struct krt_proto *p = (struct krt_proto *) P; - - if (!KRT_CF->learn) - return; - debug("KRT: Table of inheritable routes\n"); - rt_dump(p->krt_table); + channel_setup_in_table(p->p.main_channel, 1); } #endif @@ -547,7 +322,7 @@ rte_feed_count(net *n) { uint count = 0; for (struct rte_storage *e = n->routes; e; e = e->next) - if (rte_is_valid(RTE_OR_NULL(e))) + if (rte_is_valid(RTES_OR_NULL(e))) count++; return count; } @@ -557,7 +332,7 @@ rte_feed_obtain(net *n, rte **feed, uint count) { uint i = 0; for (struct rte_storage *e = n->routes; e; e = e->next) - if (rte_is_valid(RTE_OR_NULL(e))) + if (rte_is_valid(RTES_OR_NULL(e))) { ASSERT_DIE(i < count); feed[i++] = &e->rte; @@ -643,7 +418,7 @@ krt_got_route(struct krt_proto *p, rte *e, s8 src) case KRT_SRC_ALIEN: if (KRT_CF->learn) - krt_learn_scan(p, e); + krt_learn_rte(p, e); else krt_trace_in_rl(&rl_alien, p, e, "[alien] ignored"); return; @@ -712,6 +487,11 @@ static void krt_init_scan(struct krt_proto *p) { bmap_reset(&p->seen_map, 1024); + +#ifdef KRT_ALLOW_LEARN + if (KRT_CF->learn) + channel_refresh_begin(p->p.main_channel); +#endif } static void @@ -739,7 +519,7 @@ krt_prune(struct krt_proto *p) #ifdef KRT_ALLOW_LEARN if (KRT_CF->learn) - krt_learn_prune(p); + channel_refresh_end(p->p.main_channel); #endif if (p->ready) @@ -781,7 +561,7 @@ krt_got_route_async(struct krt_proto *p, rte *e, int new, s8 src) case KRT_SRC_ALIEN: if (KRT_CF->learn) { - krt_learn_async(p, e, new); + krt_learn_rte(p, e); return; } #endif @@ -1027,6 +807,7 @@ krt_init(struct proto_config *CF) p->p.if_notify = krt_if_notify; p->p.reload_routes = krt_reload_routes; p->p.feed_end = krt_feed_end; + p->p.rte_better = krt_rte_better; krt_sys_init(p); return &p->p; @@ -1182,7 +963,4 @@ struct protocol proto_unix_kernel = { .reconfigure = krt_reconfigure, .copy_config = krt_copy_config, .get_attr = krt_get_attr, -#ifdef KRT_ALLOW_LEARN - .dump = krt_dump, -#endif }; diff --git a/sysdep/unix/krt.h b/sysdep/unix/krt.h index f6ad6fde..968c5b16 100644 --- a/sysdep/unix/krt.h +++ b/sysdep/unix/krt.h @@ -51,10 +51,6 @@ struct krt_proto { struct proto p; struct krt_state sys; /* Sysdep state */ -#ifdef KRT_ALLOW_LEARN - struct rtable *krt_table; /* Internal table of inherited routes */ -#endif - #ifndef CONFIG_ALL_TABLES_AT_ONCE timer *scan_timer; #endif From f18968f52f461946b64aaea6c4a0e88c5235fb07 Mon Sep 17 00:00:00 2001 From: Maria Matejka Date: Fri, 22 Oct 2021 19:43:55 +0200 Subject: [PATCH 13/59] Better profylaction recursive route loops In some specific configurations, it was possible to send BIRD into an infinite loop of recursive next hop resolution. This was caused by route priority inversion. To prevent priority inversions affecting other next hops, we simply refuse to resolve any next hop if the best route for the matching prefix is recursive or any other route with the same preference is recursive. Next hop resolution doesn't change route priority, therefore it is perfectly OK to resolve BGP next hops e.g. by an OSPF route, yet if the same (or covering) prefix is also announced by iBGP, by retraction of the OSPF route we would get a possible priority inversion. --- nest/rt-table.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/nest/rt-table.c b/nest/rt-table.c index 2f480992..2dcb2e26 100644 --- a/nest/rt-table.c +++ b/nest/rt-table.c @@ -2649,9 +2649,10 @@ rt_update_hostentry(rtable *tab, struct hostentry *he) { struct rte_storage *e = n->routes; rta *a = e->rte.attrs; - pxlen = n->n.addr->pxlen; + word pref = a->pref; - if (a->hostentry) + for (struct rte_storage *ee = n->routes; ee; ee = ee->next) + if ((ee->rte.attrs->pref >= pref) && ee->rte.attrs->hostentry) { /* Recursive route should not depend on another recursive route */ log(L_WARN "Next hop address %I resolvable through recursive route for %N", @@ -2659,6 +2660,8 @@ rt_update_hostentry(rtable *tab, struct hostentry *he) goto done; } + pxlen = n->n.addr->pxlen; + if (a->dest == RTD_UNICAST) { for (struct nexthop *nh = &(a->nh); nh; nh = nh->next) From c70b3198dc349127273d202ab8c36afeebb6d9d0 Mon Sep 17 00:00:00 2001 From: Maria Matejka Date: Mon, 27 Sep 2021 13:04:16 +0200 Subject: [PATCH 14/59] Route export is now asynchronous. To allow for multithreaded execution, we need to break the import-export chain and buffer the exports before actually processing them. --- conf/conf.h | 2 +- nest/config.Y | 2 +- nest/proto.c | 72 +++-- nest/route.h | 33 ++- nest/rt-table.c | 699 +++++++++++++++++++++++++++++++++++++++--------- 5 files changed, 667 insertions(+), 141 deletions(-) diff --git a/conf/conf.h b/conf/conf.h index 69ef8a10..4f6aa6eb 100644 --- a/conf/conf.h +++ b/conf/conf.h @@ -45,7 +45,7 @@ struct config { int cli_debug; /* Tracing of CLI connections and commands */ int latency_debug; /* I/O loop tracks duration of each event */ - int pipe_debug; /* Track route propagation through pipes */ + int table_debug; /* Track route propagation through tables */ u32 latency_limit; /* Events with longer duration are logged (us) */ u32 watchdog_warning; /* I/O loop watchdog limit for warning (us) */ u32 watchdog_timeout; /* Watchdog timeout (in seconds, 0 = disabled) */ diff --git a/nest/config.Y b/nest/config.Y index a56b25be..92f1aad2 100644 --- a/nest/config.Y +++ b/nest/config.Y @@ -348,7 +348,7 @@ debug_default: DEBUG PROTOCOLS debug_mask { new_config->proto_default_debug = $3; } | DEBUG CHANNELS debug_mask { new_config->channel_default_debug = $3; } | DEBUG COMMANDS expr { new_config->cli_debug = $3; } - | DEBUG PIPE bool { new_config->pipe_debug = $3; } + | DEBUG TABLES bool { new_config->table_debug = $3; } ; /* MRTDUMP PROTOCOLS is in systep/unix/config.Y */ diff --git a/nest/proto.c b/nest/proto.c index cf448fd9..fae0647a 100644 --- a/nest/proto.c +++ b/nest/proto.c @@ -649,7 +649,7 @@ channel_aux_import_stopped(struct rt_import_request *req) static void channel_aux_export_stopped(struct rt_export_request *req) { - struct channel_aux_table *cat = SKIP_BACK(struct channel_aux_table, push, req); + struct channel_aux_table *cat = SKIP_BACK(struct channel_aux_table, get, req); req->hook = NULL; if (cat->refeed_pending && !cat->stop) @@ -669,7 +669,6 @@ channel_aux_stop(struct channel_aux_table *cat) rt_stop_import(&cat->push, channel_aux_import_stopped); rt_stop_export(&cat->get, channel_aux_export_stopped); - rt_lock_table(cat->tab); cat->tab->deleted = channel_aux_stopped; cat->tab->del_data = cat; rt_unlock_table(cat->tab); @@ -714,17 +713,51 @@ channel_get_log_state_change(struct rt_export_request *req, u8 state) void rte_update_direct(struct channel *c, const net_addr *n, rte *new, struct rte_src *src); +static int +channel_aux_export_one_any(struct rt_export_request *req, struct rt_pending_export *rpe, rte **new, rte **old) +{ + struct rte_src *src = rpe->new ? rpe->new->rte.src : rpe->old->rte.src; + *old = RTES_OR_NULL(rpe->old); + struct rte_storage *new_stored; + + while (rpe) + { + new_stored = rpe->new; + rpe_mark_seen(req->hook, rpe); + rpe = rpe_next(rpe, src); + } + + *new = RTES_CLONE(new_stored, *new); + + return (*new || *old) && (&new_stored->rte != *old); +} + +static int +channel_aux_export_one_best(struct rt_export_request *req, struct rt_pending_export *rpe, rte **new, rte **old) +{ + *old = RTES_OR_NULL(rpe->old_best); + struct rte_storage *new_stored; + + while (rpe) + { + new_stored = rpe->new_best; + rpe_mark_seen(req->hook, rpe); + rpe = rpe_next(rpe, NULL); + } + + *new = RTES_CLONE(new_stored, *new); + + return (*new || *old) && (&new_stored->rte != *old); +} + static void channel_in_export_one_any(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *rpe) { struct channel_aux_table *cat = SKIP_BACK(struct channel_aux_table, get, req); - if (!rpe->new && !rpe->old) - return; - - rte n0; - struct rte_src *src = rpe->new ? rpe->new->rte.src : rpe->old->rte.src; - rte_update_direct(cat->c, net, RTES_CLONE(rpe->new, &n0), src); + rte n0, *new = &n0, *old; + if (channel_aux_export_one_any(req, rpe, &new, &old)) + rte_update_direct(cat->c, net, new, old ? old->src : new->src); } static void @@ -732,12 +765,9 @@ channel_in_export_one_best(struct rt_export_request *req, const net_addr *net, s { struct channel_aux_table *cat = SKIP_BACK(struct channel_aux_table, get, req); - if (!rpe->new && !rpe->old) - return; - - rte n0; - struct rte_src *src = rpe->old_best ? rpe->old_best->rte.src : rpe->new_best->rte.src; - rte_update_direct(cat->c, net, RTES_CLONE(rpe->new_best, &n0), src); + rte n0, *new = &n0, *old; + if (channel_aux_export_one_best(req, rpe, &new, &old)) + rte_update_direct(cat->c, net, new, old ? old->src : new->src); } static void @@ -768,16 +798,18 @@ static void channel_out_export_one_any(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *rpe) { struct channel_aux_table *cat = SKIP_BACK(struct channel_aux_table, get, req); - rte n0; - do_rt_notify_direct(cat->c, net, RTES_CLONE(rpe->new, &n0), RTES_OR_NULL(rpe->old)); + rte n0, *new = &n0, *old; + if (channel_aux_export_one_any(req, rpe, &new, &old)) + do_rt_notify_direct(cat->c, net, new, old); } static void channel_out_export_one_best(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *rpe) { struct channel_aux_table *cat = SKIP_BACK(struct channel_aux_table, get, req); - rte n0; - do_rt_notify_direct(cat->c, net, RTES_CLONE(rpe->new_best, &n0), RTES_OR_NULL(rpe->old_best)); + rte n0, *new = &n0, *old; + if (channel_aux_export_one_best(req, rpe, &new, &old)) + do_rt_notify_direct(cat->c, net, new, old); } static void @@ -831,6 +863,7 @@ channel_setup_in_table(struct channel *c, int best) c->in_table->c = c; c->in_table->tab = rt_setup(c->proto->pool, &cat->tab_cf); self_link(&c->in_table->tab->n); + rt_lock_table(c->in_table->tab); rt_request_import(c->in_table->tab, &c->in_table->push); rt_request_export(c->in_table->tab, &c->in_table->get); @@ -872,6 +905,7 @@ channel_setup_out_table(struct channel *c) c->out_table->c = c; c->out_table->tab = rt_setup(c->proto->pool, &cat->tab_cf); self_link(&c->out_table->tab->n); + rt_lock_table(c->out_table->tab); rt_request_import(c->out_table->tab, &c->out_table->push); rt_request_export(c->out_table->tab, &c->out_table->get); @@ -1051,6 +1085,8 @@ channel_request_table_feeding(struct channel *c) void channel_request_feeding(struct channel *c) { + CD(c, "Refeed requested"); + ASSERT(c->out_req.hook); if (c->out_table) diff --git a/nest/route.h b/nest/route.h index cb66be2a..d6474e09 100644 --- a/nest/route.h +++ b/nest/route.h @@ -15,6 +15,8 @@ #include "lib/resource.h" #include "lib/net.h" +#include + struct ea_list; struct protocol; struct proto; @@ -152,6 +154,7 @@ struct rtable_config { byte sorted; /* Routes of network are sorted according to rte_better() */ btime min_settle_time; /* Minimum settle time for notifications */ btime max_settle_time; /* Maximum settle time for notifications */ + btime export_settle_time; /* Delay before exports are announced */ }; typedef struct rtable { @@ -181,12 +184,20 @@ typedef struct rtable { byte prune_state; /* Table prune state, 1 -> scheduled, 2-> running */ byte hcu_scheduled; /* Hostcache update is scheduled */ byte nhu_state; /* Next Hop Update state */ + byte export_used; /* Export journal setup scheduled */ struct fib_iterator prune_fit; /* Rtable prune FIB iterator */ struct fib_iterator nhu_fit; /* Next Hop Update FIB iterator */ struct tbf rl_pipe; /* Rate limiting token buffer for pipe collisions */ list subscribers; /* Subscribers for notifications */ struct timer *settle_timer; /* Settle time for notifications */ + + list pending_exports; /* List of packed struct rt_pending_export */ + btime base_export_time; /* When first pending export was announced */ + struct timer *export_timer; + + struct rt_pending_export *first_export; /* First export to announce */ + u64 next_export_seq; /* The next export will have this ID */ } rtable; struct rt_subscription { @@ -203,6 +214,7 @@ struct rt_subscription { typedef struct network { struct rte_storage *routes; /* Available routes for this network */ + struct rt_pending_export *last, *first; /* Routes with unfinished exports */ struct fib_node n; /* FIB flags reserved for kernel syncer */ } net; @@ -294,6 +306,7 @@ struct rt_import_hook { u32 withdraws_accepted; /* Number of route withdraws accepted and processed */ } stats; + u64 flush_seq; /* Table export seq when the channel announced flushing */ btime last_state_change; /* Time of last state transition */ u8 import_state; /* IS_* */ @@ -306,7 +319,9 @@ struct rt_import_hook { }; struct rt_pending_export { + struct rt_pending_export * _Atomic next; /* Next export for the same destination */ struct rte_storage *new, *new_best, *old, *old_best; + u64 seq; /* Sequential ID (table-local) of the pending export */ }; struct rt_export_request { @@ -333,7 +348,6 @@ struct rt_export_hook { rtable *table; /* The connected table */ pool *pool; - linpool *lp; struct rt_export_request *req; /* The requestor */ @@ -345,10 +359,15 @@ struct rt_export_hook { struct fib_iterator feed_fit; /* Routing table iterator used during feeding */ + struct bmap seq_map; /* Keep track which exports were already procesed */ + + struct rt_pending_export * _Atomic last_export;/* Last export processed */ + struct rt_pending_export *rpe_next; /* Next pending export to process */ + btime last_state_change; /* Time of last state transition */ u8 refeed_pending; /* Refeeding and another refeed is scheduled */ - u8 export_state; /* Route export state (TES_*, see below) */ + _Atomic u8 export_state; /* Route export state (TES_*, see below) */ struct event *event; /* Event running all the export operations */ @@ -384,6 +403,16 @@ static inline u8 rt_export_get_state(struct rt_export_hook *eh) { return eh ? eh void rte_import(struct rt_import_request *req, const net_addr *net, rte *new, struct rte_src *src); +/* Get next rpe. If src is given, it must match. */ +struct rt_pending_export *rpe_next(struct rt_pending_export *rpe, struct rte_src *src); + +/* Mark the pending export processed */ +void rpe_mark_seen(struct rt_export_hook *hook, struct rt_pending_export *rpe); + +/* Get pending export seen status */ +int rpe_get_seen(struct rt_export_hook *hook, struct rt_pending_export *rpe); + + /* Types of route announcement, also used as flags */ #define RA_UNDEF 0 /* Undefined RA type */ #define RA_OPTIMAL 1 /* Announcement of optimal route change */ diff --git a/nest/rt-table.c b/nest/rt-table.c index 2dcb2e26..c049101a 100644 --- a/nest/rt-table.c +++ b/nest/rt-table.c @@ -45,12 +45,24 @@ #include "lib/string.h" #include "lib/alloca.h" +#include + pool *rt_table_pool; static linpool *rte_update_pool; list routing_tables; +/* Data structures for export journal */ +#define RT_PENDING_EXPORT_ITEMS (page_size - sizeof(struct rt_export_block)) / sizeof(struct rt_pending_export) + +struct rt_export_block { + node n; + _Atomic u32 end; + _Atomic _Bool not_last; + struct rt_pending_export export[]; +}; + static void rt_free_hostcache(rtable *tab); static void rt_notify_hostcache(rtable *tab, net *net); static void rt_update_hostcache(rtable *tab); @@ -59,6 +71,12 @@ static inline void rt_prune_table(rtable *tab); static inline void rt_schedule_notify(rtable *tab); static void rt_feed_channel(void *); +static inline void rt_export_used(rtable *tab); +static void rt_export_cleanup(rtable *tab); + +static inline void rte_update_lock(void); +static inline void rte_update_unlock(void); + const char *rt_import_state_name_array[TIS_MAX] = { [TIS_DOWN] = "DOWN", [TIS_UP] = "UP", @@ -385,9 +403,9 @@ rte_mergable(rte *pri, rte *sec) static void rte_trace(const char *name, const rte *e, int dir, const char *msg) { - log(L_TRACE "%s %c %s %N src %uL %uG %uS %s%s", + log(L_TRACE "%s %c %s %N src %uL %uG %uS id %u %s%s", name, dir, msg, e->net, - e->src->private_id, e->src->global_id, e->stale_cycle, + e->src->private_id, e->src->global_id, e->stale_cycle, e->id, rta_dest_name(e->attrs->dest), rte_is_filtered(e) ? " (filtered)" : ""); } @@ -583,6 +601,16 @@ rt_notify_basic(struct channel *c, const net_addr *net, rte *new, rte *old) do_rt_notify(c, net, new, old); } +static void +channel_rpe_mark_seen(struct rt_export_request *req, struct rt_pending_export *rpe) +{ + struct channel *c = SKIP_BACK(struct channel, out_req, req); + + rpe_mark_seen(req->hook, rpe); + if (rpe->old) + bmap_clear(&c->export_reject_map, rpe->old->rte.id); +} + void rt_notify_accepted(struct rt_export_request *req, const net_addr *n, struct rt_pending_export *rpe, struct rte **feed, uint count) @@ -626,38 +654,30 @@ rt_notify_accepted(struct rt_export_request *req, const net_addr *n, struct rt_p } } +done: /* Check obsolete routes for previously exported */ - if (!old_best) - if (rpe && rpe->old && bmap_test(&c->export_map, rpe->old->rte.id)) - old_best = &rpe->old->rte; - -/* for (; rpe; rpe = atomic_load_explicit(&rpe->next, memory_order_relaxed)) + while (rpe) + { + channel_rpe_mark_seen(req, rpe); + if (rpe->old) { - if (rpe->old && bmap_test(&hook->accept_map, rpe->old->id)) + if (bmap_test(&c->export_map, rpe->old->rte.id)) { - old_best = &rpe->old.rte; - break; + ASSERT_DIE(old_best == NULL); + old_best = &rpe->old->rte; } - - if (rpe == rpe_last) - break; } - */ + rpe = rpe_next(rpe, NULL); + } /* Nothing to export */ if (!new_best && !old_best) { DBG("rt_notify_accepted: nothing to export\n"); - goto done; + return; } do_rt_notify(c, n, new_best, old_best); - -done: - /* Drop the old stored rejection if applicable. - * new->id == old->id happens when updating hostentries. */ - if (rpe && rpe->old && (!rpe->new || (rpe->new->rte.id != rpe->old->rte.id))) - bmap_clear(&c->export_reject_map, rpe->old->rte.id); } @@ -750,63 +770,70 @@ rt_notify_merged(struct rt_export_request *req, const net_addr *n, struct rt_pen } /* Check obsolete routes for previously exported */ - if (!old_best) - if (rpe && rpe->old && bmap_test(&c->export_map, rpe->old->rte.id)) - old_best = &rpe->old->rte; - -/* for (; rpe; rpe = atomic_load_explicit(&rpe->next, memory_order_relaxed)) + while (rpe) + { + channel_rpe_mark_seen(req, rpe); + if (rpe->old) { - if (rpe->old && bmap_test(&hook->accept_map, rpe->old->id)) + if (bmap_test(&c->export_map, rpe->old->rte.id)) { - old_best = &rpe->old.rte; - break; + ASSERT_DIE(old_best == NULL); + old_best = &rpe->old->rte; } - - if (rpe == rpe_last) - break; } - */ + rpe = rpe_next(rpe, NULL); + } /* Prepare new merged route */ rte *new_merged = count ? rt_export_merged(c, feed, count, rte_update_pool, 0) : NULL; if (new_merged || old_best) do_rt_notify(c, n, new_merged, old_best); - - /* Drop the old stored rejection if applicable. - * new->id == old->id happens when updating hostentries. */ - if (rpe && rpe->old && (!rpe->new || (rpe->new->rte.id != rpe->old->rte.id))) - bmap_clear(&c->export_reject_map, rpe->old->rte.id); } void rt_notify_optimal(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *rpe) { struct channel *c = SKIP_BACK(struct channel, out_req, req); - rte n0; - if (rpe->new_best != rpe->old_best) - rt_notify_basic(c, net, RTES_CLONE(rpe->new_best, &n0), RTES_OR_NULL(rpe->old_best)); + rte *old = RTES_OR_NULL(rpe->old_best); + struct rte_storage *new_best = rpe->new_best; - /* Drop the old stored rejection if applicable. - * new->id == old->id happens when updating hostentries. */ - if (rpe->old && (!rpe->new || (rpe->new->rte.id != rpe->old->rte.id))) - bmap_clear(&c->export_reject_map, rpe->old->rte.id); + while (rpe) + { + channel_rpe_mark_seen(req, rpe); + new_best = rpe->new_best; + rpe = rpe_next(rpe, NULL); + } + + if (&new_best->rte != old) + { + rte n0, *new = RTES_CLONE(new_best, &n0); + rt_notify_basic(c, net, new, old); + } } void rt_notify_any(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *rpe) { struct channel *c = SKIP_BACK(struct channel, out_req, req); - rte n0; - if (rpe->new != rpe->old) - rt_notify_basic(c, net, RTES_CLONE(rpe->new, &n0), RTES_OR_NULL(rpe->old)); + struct rte_src *src = rpe->new ? rpe->new->rte.src : rpe->old->rte.src; + rte *old = RTES_OR_NULL(rpe->old); + struct rte_storage *new_any = rpe->new; - /* Drop the old stored rejection if applicable. - * new->id == old->id happens when updating hostentries. */ - if (rpe->old && (!rpe->new || (rpe->new->rte.id != rpe->old->rte.id))) - bmap_clear(&c->export_reject_map, rpe->old->rte.id); + while (rpe) + { + channel_rpe_mark_seen(req, rpe); + new_any = rpe->new; + rpe = rpe_next(rpe, src); + } + + if (&new_any->rte != old) + { + rte n0, *new = RTES_CLONE(new_any, &n0); + rt_notify_basic(c, net, new, old); + } } void @@ -821,6 +848,76 @@ rt_feed_any(struct rt_export_request *req, const net_addr *net, struct rt_pendin } } +void +rpe_mark_seen(struct rt_export_hook *hook, struct rt_pending_export *rpe) +{ + bmap_set(&hook->seq_map, rpe->seq); +} + +struct rt_pending_export * +rpe_next(struct rt_pending_export *rpe, struct rte_src *src) +{ + struct rt_pending_export *next = atomic_load_explicit(&rpe->next, memory_order_acquire); + + if (!next) + return NULL; + + if (!src) + return next; + + while (rpe = next) + if (src == (rpe->new ? rpe->new->rte.src : rpe->old->rte.src)) + return rpe; + else + next = atomic_load_explicit(&rpe->next, memory_order_acquire); + + return NULL; +} + +static struct rt_pending_export * rt_next_export_fast(struct rt_pending_export *last); +static void +rte_export(struct rt_export_hook *hook, struct rt_pending_export *rpe) +{ + if (bmap_test(&hook->seq_map, rpe->seq)) + goto seen; + + const net_addr *n = rpe->new_best ? rpe->new_best->rte.net : rpe->old_best->rte.net; + + if (rpe->new) + hook->stats.updates_received++; + else + hook->stats.withdraws_received++; + + if (hook->req->export_one) + hook->req->export_one(hook->req, n, rpe); + else if (hook->req->export_bulk) + { + net *net = SKIP_BACK(struct network, n.addr, (net_addr (*)[0]) n); + uint count = rte_feed_count(net); + rte **feed = NULL; + if (count) + { + feed = alloca(count * sizeof(rte *)); + rte_feed_obtain(net, feed, count); + } + hook->req->export_bulk(hook->req, n, rpe, feed, count); + } + else + bug("Export request must always provide an export method"); + +seen: + /* Get the next export if exists */ + hook->rpe_next = rt_next_export_fast(rpe); + + /* The last block may be available to free */ + if (PAGE_HEAD(hook->rpe_next) != PAGE_HEAD(rpe)) + rt_export_used(hook->table); + + /* Releasing this export for cleanup routine */ + DBG("store hook=%p last_export=%p seq=%lu\n", hook, rpe, rpe->seq); + atomic_store_explicit(&hook->last_export, rpe, memory_order_release); +} + /** * rte_announce - announce a routing table change * @tab: table the route has been added to @@ -856,19 +953,23 @@ static void rte_announce(rtable *tab, net *net, struct rte_storage *new, struct rte_storage *old, struct rte_storage *new_best, struct rte_storage *old_best) { - if (!new || !rte_is_valid(&new->rte)) - new = NULL; - - if (!old || !rte_is_valid(&old->rte)) - old = NULL; - if (!new_best || !rte_is_valid(&new_best->rte)) new_best = NULL; if (!old_best || !rte_is_valid(&old_best->rte)) old_best = NULL; - if (!new && !old && !new_best && !old_best) + if (!new || !rte_is_valid(&new->rte)) + new = NULL; + + if (old && !rte_is_valid(&old->rte)) + { + /* Filtered old route isn't announced, should be freed immediately. */ + rte_free(old, tab); + old = NULL; + } + + if ((new == old) && (new_best == old_best)) return; if (new_best != old_best) @@ -884,35 +985,194 @@ rte_announce(rtable *tab, net *net, struct rte_storage *new, struct rte_storage rt_schedule_notify(tab); - struct rt_pending_export rpe = { .new = new, .old = old, .new_best = new_best, .old_best = old_best }; - uint count = rte_feed_count(net); - rte **feed = NULL; - if (count) + if (EMPTY_LIST(tab->exports) && EMPTY_LIST(tab->pending_exports)) { - feed = alloca(count * sizeof(rte *)); - rte_feed_obtain(net, feed, count); + /* No export hook and no pending exports to cleanup. We may free the route immediately. */ + if (!old) + return; + + hmap_clear(&tab->id_map, old->rte.id); + rte_free(old, tab); + return; } - struct rt_export_hook *eh; - WALK_LIST(eh, tab->exports) + /* Get the pending export structure */ + struct rt_export_block *rpeb = NULL, *rpebsnl = NULL; + u32 end = 0; + + if (!EMPTY_LIST(tab->pending_exports)) { - if (eh->export_state == TES_STOP) + rpeb = TAIL(tab->pending_exports); + end = atomic_load_explicit(&rpeb->end, memory_order_relaxed); + if (end >= RT_PENDING_EXPORT_ITEMS) + { + ASSERT_DIE(end == RT_PENDING_EXPORT_ITEMS); + rpebsnl = rpeb; + + rpeb = NULL; + end = 0; + } + } + + if (!rpeb) + { + rpeb = alloc_page(tab->rp); + *rpeb = (struct rt_export_block) {}; + add_tail(&tab->pending_exports, &rpeb->n); + } + + /* Fill the pending export */ + struct rt_pending_export *rpe = &rpeb->export[rpeb->end]; + *rpe = (struct rt_pending_export) { + .new = new, + .new_best = new_best, + .old = old, + .old_best = old_best, + .seq = tab->next_export_seq++, + }; + + DBG("rte_announce: table=%s net=%N new=%p from %p old=%p from %p new_best=%p old_best=%p seq=%lu\n", tab->name, net->n.addr, new, new ? new->sender : NULL, old, old ? old->sender : NULL, new_best, old_best, rpe->seq); + + ASSERT_DIE(atomic_fetch_add_explicit(&rpeb->end, 1, memory_order_release) == end); + + if (rpebsnl) + { + _Bool f = 0; + ASSERT_DIE(atomic_compare_exchange_strong_explicit(&rpebsnl->not_last, &f, 1, + memory_order_release, memory_order_relaxed)); + } + + /* Append to the same-network squasher list */ + if (net->last) + { + struct rt_pending_export *rpenull = NULL; + ASSERT_DIE(atomic_compare_exchange_strong_explicit( + &net->last->next, &rpenull, rpe, + memory_order_relaxed, + memory_order_relaxed)); + + } + + net->last = rpe; + + if (!net->first) + net->first = rpe; + + if (tab->first_export == NULL) + tab->first_export = rpe; + + if (!tm_active(tab->export_timer)) + tm_start(tab->export_timer, tab->config->export_settle_time); +} + +static struct rt_pending_export * +rt_next_export_fast(struct rt_pending_export *last) +{ + /* Get the whole export block and find our position in there. */ + struct rt_export_block *rpeb = PAGE_HEAD(last); + u32 pos = (last - &rpeb->export[0]); + u32 end = atomic_load_explicit(&rpeb->end, memory_order_acquire); + ASSERT_DIE(pos < end); + + /* Next is in the same block. */ + if (++pos < end) + return &rpeb->export[pos]; + + /* There is another block. */ + if (atomic_load_explicit(&rpeb->not_last, memory_order_acquire)) + { + /* This is OK to do non-atomically because of the not_last flag. */ + rpeb = NODE_NEXT(rpeb); + return &rpeb->export[0]; + } + + /* There is nothing more. */ + return NULL; +} + +static struct rt_pending_export * +rt_next_export(struct rt_export_hook *hook, rtable *tab) +{ + /* As the table is locked, it is safe to reload the last export pointer */ + struct rt_pending_export *last = atomic_load_explicit(&hook->last_export, memory_order_acquire); + + /* It is still valid, let's reuse it */ + if (last) + return rt_next_export_fast(last); + + /* No, therefore we must process the table's first pending export */ + else + return tab->first_export; +} + +static void +rt_announce_exports(timer *tm) +{ + rtable *tab = tm->data; + + struct rt_export_hook *c; node *n; + WALK_LIST2(c, n, tab->exports, n) + { + if (atomic_load_explicit(&c->export_state, memory_order_acquire) != TES_READY) continue; - if (new) - eh->stats.updates_received++; - else - eh->stats.withdraws_received++; - - if (eh->req->export_one) - eh->req->export_one(eh->req, net->n.addr, &rpe); - else if (eh->req->export_bulk) - eh->req->export_bulk(eh->req, net->n.addr, &rpe, feed, count); - else - bug("Export request must always provide an export method"); + ev_schedule_work(c->event); } } +static struct rt_pending_export * +rt_last_export(rtable *tab) +{ + struct rt_pending_export *rpe = NULL; + + if (!EMPTY_LIST(tab->pending_exports)) + { + /* We'll continue processing exports from this export on */ + struct rt_export_block *reb = TAIL(tab->pending_exports); + ASSERT_DIE(reb->end); + rpe = &reb->export[reb->end - 1]; + } + + return rpe; +} + +#define RT_EXPORT_BULK 1024 + +static void +rt_export_hook(void *_data) +{ + struct rt_export_hook *c = _data; + + ASSERT_DIE(atomic_load_explicit(&c->export_state, memory_order_relaxed) == TES_READY); + + if (!c->rpe_next) + { + c->rpe_next = rt_next_export(c, c->table); + + if (!c->rpe_next) + { + rt_export_used(c->table); + return; + } + } + + /* Process the export */ + for (uint i=0; irpe_next); + + if (!c->rpe_next) + break; + + rte_update_unlock(); + } + + ev_schedule_work(c->event); +} + + static inline int rte_validate(struct channel *ch, rte *e) { @@ -1133,14 +1393,8 @@ rte_recalculate(struct rt_import_hook *c, net *net, rte *new, struct rte_src *sr if (new_stored) { new_stored->rte.lastmod = current_time(); - - if (!old) - { - new_stored->rte.id = hmap_first_zero(&table->id_map); - hmap_set(&table->id_map, new_stored->rte.id); - } - else - new_stored->rte.id = old->id; + new_stored->rte.id = hmap_first_zero(&table->id_map); + hmap_set(&table->id_map, new_stored->rte.id); } _Bool nb = (new_stored == net->routes); @@ -1178,13 +1432,6 @@ rte_recalculate(struct rt_import_hook *c, net *net, rte *new, struct rte_src *sr p->rte_insert(net, &new_stored->rte); #endif - if (old) - { - if (!new_stored) - hmap_clear(&table->id_map, old->id); - - rte_free(old_stored, table); - } } static int rte_update_nest_cnt; /* Nesting counter to allow recursive updates */ @@ -1384,13 +1631,16 @@ rt_export_stopped(void *data) struct rt_export_hook *hook = data; rtable *tab = hook->table; + /* Drop pending exports */ + rt_export_used(tab); + /* Unlist */ rem_node(&hook->n); - /* Reporting the channel as stopped. */ + /* Report the channel as stopped. */ hook->stopped(hook->req); - /* Freeing the hook together with its coroutine. */ + /* Free the hook together with its coroutine. */ rfree(hook->pool); rt_unlock_table(tab); @@ -1412,7 +1662,7 @@ static inline void rt_set_export_state(struct rt_export_hook *hook, u8 state) { hook->last_state_change = current_time(); - hook->export_state = state; + atomic_store_explicit(&hook->export_state, state, memory_order_release); if (hook->req->log_state_change) hook->req->log_state_change(hook->req, state); @@ -1460,15 +1710,20 @@ rt_request_export(rtable *tab, struct rt_export_request *req) pool *p = rp_new(tab->rp, "Export hook"); struct rt_export_hook *hook = req->hook = mb_allocz(p, sizeof(struct rt_export_hook)); hook->pool = p; - hook->lp = lp_new_default(p); hook->req = req; hook->table = tab; /* stats zeroed by mb_allocz */ + bmap_init(&hook->seq_map, p, 1024); + rt_set_export_state(hook, TES_HUNGRY); + struct rt_pending_export *rpe = rt_last_export(hook->table); + DBG("store hook=%p last_export=%p seq=%lu\n", hook, rpe, rpe ? rpe->seq : 0); + atomic_store_explicit(&hook->last_export, rpe, memory_order_relaxed); + hook->n = (node) {}; add_tail(&tab->exports, &hook->n); @@ -1476,10 +1731,10 @@ rt_request_export(rtable *tab, struct rt_export_request *req) DBG("New export hook %p req %p in table %s uc=%u\n", hook, req, tab->name, tab->use_count); - rt_set_export_state(hook, TES_FEEDING); - hook->event = ev_new_init(p, rt_feed_channel, hook); ev_schedule_work(hook->event); + + rt_set_export_state(hook, TES_FEEDING); } void @@ -1493,16 +1748,18 @@ rt_stop_export(struct rt_export_request *req, void (*stopped)(struct rt_export_r /* Stop feeding */ ev_postpone(hook->event); - if (hook->export_state == TES_FEEDING) + if (atomic_load_explicit(&hook->export_state, memory_order_relaxed) == TES_FEEDING) fit_get(&tab->fib, &hook->feed_fit); hook->event->hook = rt_export_stopped; hook->stopped = stopped; - rt_set_export_state(hook, TES_STOP); ev_schedule(hook->event); + + rt_set_export_state(hook, TES_STOP); } + /** * rt_refresh_begin - start a refresh cycle * @t: related routing table @@ -1649,8 +1906,8 @@ rt_dump_hooks(rtable *tab) { eh->req->dump_req(eh->req); debug(" Export hook %p requested by %p:" - " refeed_pending=%u last_state_change=%t export_state=%u stopped=%p\n", - eh, eh->req, eh->refeed_pending, eh->last_state_change, eh->export_state, eh->stopped); + " refeed_pending=%u last_state_change=%t export_state=%u\n", + eh, eh->req, eh->refeed_pending, eh->last_state_change, atomic_load_explicit(&eh->export_state, memory_order_relaxed)); } debug("\n"); } @@ -1700,6 +1957,18 @@ rt_schedule_prune(rtable *tab) tab->prune_state |= 1; } +void +rt_export_used(rtable *tab) +{ + if (config->table_debug) + log(L_TRACE "%s: Export cleanup requested", tab->name); + + if (tab->export_used) + return; + + tab->export_used = 1; + ev_schedule(tab->rt_event); +} static void rt_event(void *ptr) @@ -1708,6 +1977,9 @@ rt_event(void *ptr) rt_lock_table(tab); + if (tab->export_used) + rt_export_cleanup(tab); + if (tab->hcu_scheduled) rt_update_hostcache(tab); @@ -1857,13 +2129,17 @@ rt_setup(pool *pp, struct rtable_config *cf) init_list(&t->imports); init_list(&t->exports); + hmap_init(&t->id_map, p, 1024); hmap_set(&t->id_map, 0); + init_list(&t->pending_exports); init_list(&t->subscribers); t->rt_event = ev_new_init(p, rt_event, t); + t->export_timer = tm_new_init(p, rt_announce_exports, t, 0, 0); t->last_rt_change = t->gc_time = current_time(); + t->next_export_seq = 1; t->rl_pipe = (struct tbf) TBF_DEFAULT_LOG_LIMITS; @@ -1961,7 +2237,7 @@ again: } } - if (!n->routes) /* Orphaned FIB entry */ + if (!n->routes && !n->first) /* Orphaned FIB entry */ { FIB_ITERATE_PUT(fit); fib_delete(&tab->fib, n); @@ -1982,15 +2258,15 @@ again: rt_prune_sources(); + uint flushed_channels = 0; + /* Close flushed channels */ WALK_LIST2_DELSAFE(ih, n, x, tab->imports, n) if (ih->import_state == TIS_FLUSHING) { - rt_set_import_state(ih, TIS_CLEARED); - ih->stopped(ih->req); - rem_node(&ih->n); - mb_free(ih); - rt_unlock_table(tab); + ih->flush_seq = tab->next_export_seq; + rt_set_import_state(ih, TIS_WAITING); + flushed_channels++; } else if (ih->stale_pruning != ih->stale_pruned) { @@ -1999,6 +2275,185 @@ again: if (ih->req->trace_routes & D_STATES) log(L_TRACE "%s: table prune after refresh end [%u]", ih->req->name, ih->stale_pruned); } + + /* In some cases, we may want to directly proceed to export cleanup */ + if (EMPTY_LIST(tab->exports) && flushed_channels) + rt_export_cleanup(tab); +} + +static void +rt_export_cleanup(rtable *tab) +{ + tab->export_used = 0; + + u64 min_seq = ~((u64) 0); + struct rt_pending_export *last_export_to_free = NULL; + struct rt_pending_export *first_export = tab->first_export; + + struct rt_export_hook *eh; + node *n; + WALK_LIST2(eh, n, tab->exports, n) + { + switch (atomic_load_explicit(&eh->export_state, memory_order_acquire)) + { + case TES_DOWN: + case TES_HUNGRY: + continue; + + case TES_READY: + { + struct rt_pending_export *last = atomic_load_explicit(&eh->last_export, memory_order_acquire); + if (!last) + /* No last export means that the channel has exported nothing since last cleanup */ + goto done; + + else if (min_seq > last->seq) + { + min_seq = last->seq; + last_export_to_free = last; + } + continue; + } + + default: + /* It's only safe to cleanup when the export state is idle or regular. No feeding or stopping allowed. */ + goto done; + } + } + + tab->first_export = last_export_to_free ? rt_next_export_fast(last_export_to_free) : NULL; + + if (config->table_debug) + log(L_TRACE "%s: Export cleanup, old first_export seq %lu, new %lu, min_seq %ld", + tab->name, + first_export ? first_export->seq : 0, + tab->first_export ? tab->first_export->seq : 0, + min_seq); + + WALK_LIST2(eh, n, tab->exports, n) + { + if (atomic_load_explicit(&eh->export_state, memory_order_acquire) != TES_READY) + continue; + + struct rt_pending_export *last = atomic_load_explicit(&eh->last_export, memory_order_acquire); + if (last == last_export_to_free) + { + /* This may fail when the channel managed to export more inbetween. This is OK. */ + atomic_compare_exchange_strong_explicit( + &eh->last_export, &last, NULL, + memory_order_release, + memory_order_relaxed); + + DBG("store hook=%p last_export=NULL\n", eh); + } + } + + while (first_export && (first_export->seq <= min_seq)) + { + ASSERT_DIE(first_export->new || first_export->old); + + const net_addr *n = first_export->new ? + first_export->new->rte.net : + first_export->old->rte.net; + net *net = SKIP_BACK(struct network, n.addr, (net_addr (*)[0]) n); + + ASSERT_DIE(net->first == first_export); + + if (first_export == net->last) + /* The only export here */ + net->last = net->first = NULL; + else + /* First is now the next one */ + net->first = atomic_load_explicit(&first_export->next, memory_order_relaxed); + + /* For now, the old route may be finally freed */ + if (first_export->old) + { + rt_rte_trace_in(D_ROUTES, first_export->old->rte.sender->req, &first_export->old->rte, "freed"); + hmap_clear(&tab->id_map, first_export->old->rte.id); + rte_free(first_export->old, tab); + } + +#ifdef LOCAL_DEBUG + memset(first_export, 0xbd, sizeof(struct rt_pending_export)); +#endif + + struct rt_export_block *reb = HEAD(tab->pending_exports); + ASSERT_DIE(reb == PAGE_HEAD(first_export)); + + u32 pos = (first_export - &reb->export[0]); + u32 end = atomic_load_explicit(&reb->end, memory_order_relaxed); + ASSERT_DIE(pos < end); + + struct rt_pending_export *next = NULL; + + if (++pos < end) + next = &reb->export[pos]; + else + { + rem_node(&reb->n); + +#ifdef LOCAL_DEBUG + memset(reb, 0xbe, page_size); +#endif + + free_page(tab->rp, reb); + + if (EMPTY_LIST(tab->pending_exports)) + { + if (config->table_debug) + log(L_TRACE "%s: Resetting export seq", tab->name); + + node *n; + WALK_LIST2(eh, n, tab->exports, n) + { + if (atomic_load_explicit(&eh->export_state, memory_order_acquire) != TES_READY) + continue; + + ASSERT_DIE(atomic_load_explicit(&eh->last_export, memory_order_acquire) == NULL); + bmap_reset(&eh->seq_map, 1024); + } + + tab->next_export_seq = 1; + } + else + { + reb = HEAD(tab->pending_exports); + next = &reb->export[0]; + } + } + + first_export = next; + } + +done:; + struct rt_import_hook *ih; node *x; + _Bool imports_stopped = 0; + WALK_LIST2_DELSAFE(ih, n, x, tab->imports, n) + if (ih->import_state == TIS_WAITING) + if (!first_export || (first_export->seq >= ih->flush_seq)) + { + ih->import_state = TIS_CLEARED; + ih->stopped(ih->req); + rem_node(&ih->n); + mb_free(ih); + rt_unlock_table(tab); + imports_stopped = 1; + } + + if (tab->export_used) + ev_schedule(tab->rt_event); + + if (imports_stopped) + { + if (config->table_debug) + log(L_TRACE "%s: Sources pruning routine requested", tab->name); + + rt_prune_sources(); + } + + if (EMPTY_LIST(tab->pending_exports) && tm_active(tab->export_timer)) + tm_stop(tab->export_timer); } void @@ -2180,6 +2635,11 @@ rt_next_hop_update_net(rtable *tab, net *n) /* Replace the route in the list */ new->next = e->next; *k = e = new; + + /* Get a new ID for the route */ + new->rte.lastmod = current_time(); + new->rte.id = hmap_first_zero(&tab->id_map); + hmap_set(&tab->id_map, new->rte.id); } ASSERT_DIE(pos == count); @@ -2213,9 +2673,6 @@ rt_next_hop_update_net(rtable *tab, net *n) rte_announce_i(tab, n, updates[i].new, updates[i].old, new, old_best); } - for (int i=0; ifeed_fit; int max_feed = 256; - ASSERT(c->export_state == TES_FEEDING); + ASSERT(atomic_load_explicit(&c->export_state, memory_order_relaxed) == TES_FEEDING); FIB_ITERATE_START(&c->table->fib, fit, net, n) { @@ -2416,8 +2873,8 @@ rt_feed_channel(void *data) return; } - if (c->export_state != TES_FEEDING) - goto done; + if (atomic_load_explicit(&c->export_state, memory_order_acquire) != TES_FEEDING) + return; if (c->req->export_bulk) { @@ -2427,8 +2884,7 @@ rt_feed_channel(void *data) rte_update_lock(); rte **feed = alloca(count * sizeof(rte *)); rte_feed_obtain(n, feed, count); - struct rt_pending_export rpe = { .new_best = n->routes }; - c->req->export_bulk(c->req, n->n.addr, &rpe, feed, count); + c->req->export_bulk(c->req, n->n.addr, NULL, feed, count); max_feed -= count; rte_update_unlock(); } @@ -2441,10 +2897,15 @@ rt_feed_channel(void *data) max_feed--; rte_update_unlock(); } + + for (struct rt_pending_export *rpe = n->first; rpe; rpe = rpe_next(rpe, NULL)) + rpe_mark_seen(c, rpe); } FIB_ITERATE_END; -done: + c->event->hook = rt_export_hook; + ev_schedule_work(c->event); + rt_set_export_state(c, TES_READY); } From a845651bc50b75b2be41b4427e04857ce3c106a6 Mon Sep 17 00:00:00 2001 From: Maria Matejka Date: Mon, 15 Feb 2021 18:23:15 +0100 Subject: [PATCH 15/59] Multithreaded BIRD needs reasonably new software to compile --- .gitlab-ci.yml | 25 ------------------------- INSTALL | 4 ++-- 2 files changed, 2 insertions(+), 27 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 65a0a05b..39202098 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -134,21 +134,11 @@ docker_fedora-34-amd64: IMG_NAME: "fedora-34-amd64" <<: *docker_build -docker_centos-7-amd64: - variables: - IMG_NAME: "centos-7-amd64" - <<: *docker_build - docker_centos-8-amd64: variables: IMG_NAME: "centos-8-amd64" <<: *docker_build -docker_ubuntu-14_04-amd64: - variables: - IMG_NAME: "ubuntu-14.04-amd64" - <<: *docker_build - docker_ubuntu-16_04-amd64: variables: IMG_NAME: "ubuntu-16.04-amd64" @@ -312,18 +302,10 @@ build-fedora-34-amd64: <<: *build-linux image: registry.labs.nic.cz/labs/bird:fedora-33-amd64 -build-centos-7-amd64: - <<: *build-linux - image: registry.labs.nic.cz/labs/bird:centos-7-amd64 - build-centos-8-amd64: <<: *build-linux image: registry.labs.nic.cz/labs/bird:centos-8-amd64 -build-ubuntu-14_04-amd64: - <<: *build-linux - image: registry.labs.nic.cz/labs/bird:ubuntu-14.04-amd64 - build-ubuntu-16_04-amd64: <<: *build-linux image: registry.labs.nic.cz/labs/bird:ubuntu-16.04-amd64 @@ -468,13 +450,6 @@ pkg-fedora-34-amd64: needs: [build-fedora-34-amd64] image: registry.labs.nic.cz/labs/bird:fedora-34-amd64 -pkg-centos-7-amd64: - <<: *pkg-rpm-wa - variables: - LC_ALL: en_US.UTF-8 - needs: [build-centos-7-amd64] - image: registry.labs.nic.cz/labs/bird:centos-7-amd64 - pkg-centos-8-amd64: <<: *pkg-rpm-wa needs: [build-centos-8-amd64] diff --git a/INSTALL b/INSTALL index 7a179284..c1094ee5 100644 --- a/INSTALL +++ b/INSTALL @@ -27,9 +27,9 @@ Requirements For compiling BIRD you need these programs and libraries: - - GNU C Compiler (or LLVM Clang) + - GNU C Compiler (or LLVM Clang) capable of compiling C11 code - GNU Make - - GNU Bison + - GNU Bison (at least 3.0) - GNU M4 - Flex From a4451535c69b8f934523905a8131ae2f16be2146 Mon Sep 17 00:00:00 2001 From: Maria Matejka Date: Wed, 4 Aug 2021 22:48:51 +0200 Subject: [PATCH 16/59] Unified time for whole BIRD In previous versions, every thread used its own time structures, effectively leading to different time in every thread and strange logging messages. The time processing code now uses global atomic variables to keep current time available for fast concurrent reading and safe updates. --- lib/timer.c | 29 +++++---------- lib/timer.h | 14 ++++---- proto/bfd/io.c | 8 ++--- sysdep/unix/io.c | 90 +++++++++++++++++++--------------------------- sysdep/unix/main.c | 1 + 5 files changed, 56 insertions(+), 86 deletions(-) diff --git a/lib/timer.c b/lib/timer.c index 6efcadb4..ff1fb5ef 100644 --- a/lib/timer.c +++ b/lib/timer.c @@ -32,6 +32,7 @@ #include "nest/bird.h" +#include "lib/coro.h" #include "lib/heap.h" #include "lib/resource.h" #include "lib/timer.h" @@ -45,23 +46,11 @@ struct timeloop main_timeloop; /* Data accessed and modified from proto/bfd/io.c */ _Thread_local struct timeloop *local_timeloop; +_Atomic btime last_time; +_Atomic btime real_time; + void wakeup_kick_current(void); -btime -current_time(void) -{ - return local_timeloop->last_time; -} - -btime -current_real_time(void) -{ - if (!local_timeloop->real_time) - times_update_real_time(local_timeloop); - - return local_timeloop->real_time; -} - #define TIMER_LESS(a,b) ((a)->expires < (b)->expires) #define TIMER_SWAP(heap,a,b,t) (t = heap[a], heap[a] = heap[b], heap[b] = t, \ @@ -164,8 +153,6 @@ tm_stop(timer *t) void timers_init(struct timeloop *loop, pool *p) { - times_init(loop); - BUFFER_INIT(loop->timers, p, 4); BUFFER_PUSH(loop->timers) = NULL; } @@ -178,8 +165,8 @@ timers_fire(struct timeloop *loop) btime base_time; timer *t; - times_update(loop); - base_time = loop->last_time; + times_update(); + base_time = current_time(); while (t = timers_first(loop)) { @@ -190,8 +177,8 @@ timers_fire(struct timeloop *loop) { btime when = t->expires + t->recurrent; - if (when <= loop->last_time) - when = loop->last_time + t->recurrent; + if (when <= base_time) + when = base_time + t->recurrent; if (t->randomize) when += random() % (t->randomize + 1); diff --git a/lib/timer.h b/lib/timer.h index bc568ee6..b201b8c8 100644 --- a/lib/timer.h +++ b/lib/timer.h @@ -14,6 +14,10 @@ #include "lib/buffer.h" #include "lib/resource.h" +#include + +extern _Atomic btime last_time; +extern _Atomic btime real_time; typedef struct timer { @@ -31,8 +35,6 @@ typedef struct timer struct timeloop { BUFFER_(timer *) timers; - btime last_time; - btime real_time; }; static inline uint timers_count(struct timeloop *loop) @@ -44,8 +46,8 @@ static inline timer *timers_first(struct timeloop *loop) extern struct timeloop main_timeloop; extern _Thread_local struct timeloop *local_timeloop; -btime current_time(void); -btime current_real_time(void); +#define current_time() atomic_load_explicit(&last_time, memory_order_acquire) +#define current_real_time() atomic_load_explicit(&real_time, memory_order_acquire) //#define now (current_time() TO_S) //#define now_real (current_real_time() TO_S) @@ -95,9 +97,7 @@ tm_start_max(timer *t, btime after) } /* In sysdep code */ -void times_init(struct timeloop *loop); -void times_update(struct timeloop *loop); -void times_update_real_time(struct timeloop *loop); +void times_update(void); /* For I/O loop */ void timers_init(struct timeloop *loop, pool *p); diff --git a/proto/bfd/io.c b/proto/bfd/io.c index 8fdc84fb..c5f1e024 100644 --- a/proto/bfd/io.c +++ b/proto/bfd/io.c @@ -172,7 +172,7 @@ events_init(struct birdloop *loop) static void events_fire(struct birdloop *loop) { - times_update(&loop->time); + times_update(); ev_run_list(&loop->event_list); } @@ -332,7 +332,7 @@ sockets_fire(struct birdloop *loop) sock **psk = loop->poll_sk.data; int poll_num = loop->poll_fd.used - 1; - times_update(&loop->time); + times_update(); /* Last fd is internal wakeup fd */ if (pfd[poll_num].revents & POLLIN) @@ -365,7 +365,7 @@ sockets_fire(struct birdloop *loop) * Birdloop */ -static void * birdloop_main(void *arg); +static void *birdloop_main(void *arg); struct birdloop * birdloop_new(void) @@ -461,7 +461,7 @@ birdloop_main(void *arg) events_fire(loop); timers_fire(&loop->time); - times_update(&loop->time); + times_update(); if (events_waiting(loop)) timeout = 0; else if (t = timers_first(&loop->time)) diff --git a/sysdep/unix/io.c b/sysdep/unix/io.c index 40841ea4..90bb5d64 100644 --- a/sysdep/unix/io.c +++ b/sysdep/unix/io.c @@ -123,55 +123,50 @@ rf_fileno(struct rfile *f) btime boot_time; + void -times_init(struct timeloop *loop) +times_update(void) { struct timespec ts; int rv; + btime old_time = current_time(); + btime old_real_time = current_real_time(); + rv = clock_gettime(CLOCK_MONOTONIC, &ts); if (rv < 0) die("Monotonic clock is missing"); if ((ts.tv_sec < 0) || (((u64) ts.tv_sec) > ((u64) 1 << 40))) log(L_WARN "Monotonic clock is crazy"); - - loop->last_time = ts.tv_sec S + ts.tv_nsec NS; - loop->real_time = 0; -} - -void -times_update(struct timeloop *loop) -{ - struct timespec ts; - int rv; - - rv = clock_gettime(CLOCK_MONOTONIC, &ts); - if (rv < 0) - die("clock_gettime: %m"); - + btime new_time = ts.tv_sec S + ts.tv_nsec NS; - if (new_time < loop->last_time) + if (new_time < old_time) log(L_ERR "Monotonic clock is broken"); - loop->last_time = new_time; - loop->real_time = 0; -} - -void -times_update_real_time(struct timeloop *loop) -{ - struct timespec ts; - int rv; - rv = clock_gettime(CLOCK_REALTIME, &ts); if (rv < 0) die("clock_gettime: %m"); - loop->real_time = ts.tv_sec S + ts.tv_nsec NS; -} + btime new_real_time = ts.tv_sec S + ts.tv_nsec NS; + if (!atomic_compare_exchange_strong_explicit( + &last_time, + &old_time, + new_time, + memory_order_acq_rel, + memory_order_relaxed)) + DBG("Time update collision: last_time"); + + if (!atomic_compare_exchange_strong_explicit( + &real_time, + &old_real_time, + new_real_time, + memory_order_acq_rel, + memory_order_relaxed)) + DBG("Time update collision: real_time"); +} /** * DOC: Sockets @@ -2017,30 +2012,17 @@ struct event_log_entry static struct event_log_entry event_log[EVENT_LOG_LENGTH]; static struct event_log_entry *event_open; static int event_log_pos, event_log_num, watchdog_active; -static btime last_time; +static btime last_io_time; static btime loop_time; static void io_update_time(void) { - struct timespec ts; - int rv; - - /* - * This is third time-tracking procedure (after update_times() above and - * times_update() in BFD), dedicated to internal event log and latency - * tracking. Hopefully, we consolidate these sometimes. - */ - - rv = clock_gettime(CLOCK_MONOTONIC, &ts); - if (rv < 0) - die("clock_gettime: %m"); - - last_time = ts.tv_sec S + ts.tv_nsec NS; + last_io_time = current_time(); if (event_open) { - event_open->duration = last_time - event_open->timestamp; + event_open->duration = last_io_time - event_open->timestamp; if (event_open->duration > config->latency_limit) log(L_WARN "Event 0x%p 0x%p took %d ms", @@ -2069,7 +2051,7 @@ io_log_event(void *hook, void *data) en->hook = hook; en->data = data; - en->timestamp = last_time; + en->timestamp = last_io_time; en->duration = 0; event_log_num++; @@ -2097,14 +2079,14 @@ io_log_dump(void) struct event_log_entry *en = event_log + (event_log_pos + i) % EVENT_LOG_LENGTH; if (en->hook) log(L_DEBUG " Event 0x%p 0x%p at %8d for %d ms", en->hook, en->data, - (int) ((last_time - en->timestamp) TO_MS), (int) (en->duration TO_MS)); + (int) ((last_io_time - en->timestamp) TO_MS), (int) (en->duration TO_MS)); } } void watchdog_sigalrm(int sig UNUSED) { - /* Update last_time and duration, but skip latency check */ + /* Update last_io_time and duration, but skip latency check */ config->latency_limit = 0xffffffff; io_update_time(); @@ -2117,7 +2099,7 @@ watchdog_start1(void) { io_update_time(); - loop_time = last_time; + loop_time = last_io_time; } static inline void @@ -2125,7 +2107,7 @@ watchdog_start(void) { io_update_time(); - loop_time = last_time; + loop_time = last_io_time; event_log_num = 0; if (config->watchdog_timeout) @@ -2146,7 +2128,7 @@ watchdog_stop(void) watchdog_active = 0; } - btime duration = last_time - loop_time; + btime duration = last_io_time - loop_time; if (duration > config->watchdog_warning) log(L_WARN "I/O loop cycle took %d ms for %d events", (int) (duration TO_MS), event_log_num); @@ -2202,7 +2184,7 @@ io_loop(void) watchdog_start1(); for(;;) { - times_update(&main_timeloop); + times_update(); events = ev_run_list(&global_event_list); events = ev_run_list_limited(&global_work_list, WORK_EVENTS_MAX) || events; timers_fire(&main_timeloop); @@ -2212,7 +2194,7 @@ io_loop(void) poll_tout = (events ? 0 : 3000); /* Time in milliseconds */ if (t = timers_first(&main_timeloop)) { - times_update(&main_timeloop); + times_update(); timeout = (tm_remains(t) TO_MS) + 1; poll_tout = MIN(poll_tout, timeout); } @@ -2302,7 +2284,7 @@ io_loop(void) continue; } - times_update(&main_timeloop); + times_update(); /* guaranteed to be non-empty */ current_sock = SKIP_BACK(sock, n, HEAD(sock_list)); diff --git a/sysdep/unix/main.c b/sysdep/unix/main.c index dabfc554..d35424ff 100644 --- a/sysdep/unix/main.c +++ b/sysdep/unix/main.c @@ -903,6 +903,7 @@ main(int argc, char **argv) dmalloc_debug(0x2f03d00); #endif + times_update(); resource_sys_init(); parse_args(argc, argv); log_switch(1, NULL, NULL); From 94eb0858c2b938549d9d1703c872c6149901e7dd Mon Sep 17 00:00:00 2001 From: Maria Matejka Date: Sat, 19 Jun 2021 20:50:18 +0200 Subject: [PATCH 17/59] Converting the former BFD loop to a universal IO loop and protocol loop. There is a simple universal IO loop, taking care of events, timers and sockets. Primarily, one instance of a protocol should use exactly one IO loop to do all its work, as is now done in BFD. Contrary to previous versions, the loop is now launched and cleaned by the nest/proto.c code, allowing for a protocol to just request its own loop by setting the loop's lock order in config higher than the_bird. It is not supported nor checked if any protocol changed the requested lock order in reconfigure. No protocol should do it at all. --- filter/filter_test.c | 1 + lib/birdlib.h | 1 + lib/coro.h | 5 +- lib/event.c | 133 ++++++++++------- lib/event.h | 41 +++++- lib/event_test.c | 7 +- lib/flowspec_test.c | 3 + lib/io-loop.h | 54 +++++++ lib/lists.h | 12 ++ lib/locking.h | 21 ++- lib/socket.h | 3 + lib/timer.c | 53 +++---- lib/timer.h | 28 ++-- nest/a-path_test.c | 3 + nest/a-set_test.c | 4 + nest/config.Y | 1 + nest/proto.c | 173 ++++++++++++++-------- nest/protocol.h | 7 +- nest/route.h | 3 + nest/rt-table.c | 18 ++- proto/bfd/bfd.c | 166 ++++++++++++--------- proto/bfd/bfd.h | 8 +- proto/bfd/config.Y | 1 + proto/bfd/packets.c | 4 +- proto/bgp/bgp.c | 1 + sysdep/unix/coroutine.c | 21 ++- sysdep/unix/io-loop.c | 317 ++++++++++++++++++++++------------------ sysdep/unix/io-loop.h | 41 +++--- sysdep/unix/io.c | 85 +++++++---- sysdep/unix/main.c | 5 +- sysdep/unix/unix.h | 1 - test/bt-utils.c | 7 +- 32 files changed, 783 insertions(+), 445 deletions(-) create mode 100644 lib/io-loop.h diff --git a/filter/filter_test.c b/filter/filter_test.c index 7e4af092..2a0b5431 100644 --- a/filter/filter_test.c +++ b/filter/filter_test.c @@ -72,6 +72,7 @@ int main(int argc, char *argv[]) { bt_init(argc, argv); + bt_bird_init(); bt_assert_hook = bt_assert_filter; diff --git a/lib/birdlib.h b/lib/birdlib.h index 3dc39d19..385bf75c 100644 --- a/lib/birdlib.h +++ b/lib/birdlib.h @@ -71,6 +71,7 @@ static inline int u64_cmp(u64 i1, u64 i2) /* Macros for gcc attributes */ #define NORET __attribute__((noreturn)) +#define USE_RESULT __atribute__((warn_unused_result)) #define UNUSED __attribute__((unused)) #define PACKED __attribute__((packed)) #define NONNULL(...) __attribute__((nonnull((__VA_ARGS__)))) diff --git a/lib/coro.h b/lib/coro.h index 51712b36..17ccff89 100644 --- a/lib/coro.h +++ b/lib/coro.h @@ -2,7 +2,7 @@ * BIRD Coroutines * * (c) 2017 Martin Mares - * (c) 2020 Maria Matejka + * (c) 2020-2021 Maria Matejka * * Can be freely distributed and used under the terms of the GNU GPL. */ @@ -22,5 +22,8 @@ struct coroutine; */ struct coroutine *coro_run(pool *, void (*entry)(void *), void *data); +/* Get self. */ +extern _Thread_local struct coroutine *this_coro; + #endif diff --git a/lib/event.c b/lib/event.c index 273447e0..6c5c8b14 100644 --- a/lib/event.c +++ b/lib/event.c @@ -19,8 +19,14 @@ * events in them and explicitly ask to run them. */ +#undef LOCAL_DEBUG + #include "nest/bird.h" #include "lib/event.h" +#include "lib/locking.h" +#include "lib/io-loop.h" + +extern _Thread_local struct coroutine *this_coro; event_list global_event_list; event_list global_work_list; @@ -28,11 +34,16 @@ event_list global_work_list; inline void ev_postpone(event *e) { + event_list *el = e->list; + if (!el) + return; + + ASSERT_DIE(birdloop_inside(el->loop)); + + LOCK_DOMAIN(event, el->lock); if (ev_active(e)) - { - rem_node(&e->n); - e->n.next = NULL; - } + rem_node(&e->n); + UNLOCK_DOMAIN(event, el->lock); } static void @@ -95,40 +106,25 @@ ev_run(event *e) * list @l which can be run by calling ev_run_list(). */ inline void -ev_enqueue(event_list *l, event *e) +ev_send(event_list *l, event *e) { - ev_postpone(e); - add_tail(l, &e->n); -} + DBG("ev_send(%p, %p)\n", l, e); + ASSERT_DIE(e->hook); + ASSERT_DIE(!e->list || (e->list == l) || (e->list->loop == l->loop)); -/** - * ev_schedule - schedule an event - * @e: an event - * - * This function schedules an event by enqueueing it to a system-wide - * event list which is run by the platform dependent code whenever - * appropriate. - */ -void -ev_schedule(event *e) -{ - ev_enqueue(&global_event_list, e); -} + e->list = l; -/** - * ev_schedule_work - schedule a work-event. - * @e: an event - * - * This function schedules an event by enqueueing it to a system-wide work-event - * list which is run by the platform dependent code whenever appropriate. This - * is designated for work-events instead of regular events. They are executed - * less often in order to not clog I/O loop. - */ -void -ev_schedule_work(event *e) -{ - if (!ev_active(e)) - add_tail(&global_work_list, &e->n); + LOCK_DOMAIN(event, l->lock); + if (enlisted(&e->n)) + { + UNLOCK_DOMAIN(event, l->lock); + return; + } + + add_tail(&l->events, &e->n); + UNLOCK_DOMAIN(event, l->lock); + + birdloop_ping(l->loop); } void io_log_event(void *hook, void *data); @@ -142,35 +138,64 @@ void io_log_event(void *hook, void *data); int ev_run_list(event_list *l) { - node *n; - list tmp_list; + const _Bool legacy = LEGACY_EVENT_LIST(l); + if (legacy) + ASSERT_THE_BIRD_LOCKED; + + node *n; + + list tmp_list; init_list(&tmp_list); - add_tail_list(&tmp_list, l); - init_list(l); + + /* Move the event list contents to a local list to avoid executing repeatedly added events */ + LOCK_DOMAIN(event, l->lock); + add_tail_list(&tmp_list, &l->events); + init_list(&l->events); + UNLOCK_DOMAIN(event, l->lock); + WALK_LIST_FIRST(n, tmp_list) { event *e = SKIP_BACK(event, n, n); - /* This is ugly hack, we want to log just events executed from the main I/O loop */ - if ((l == &global_event_list) || (l == &global_work_list)) + if (legacy) + { + /* The legacy way of event execution */ io_log_event(e->hook, e->data); - - ev_run(e); + ev_postpone(e); + e->hook(e->data); + } + else + { + // io_log_event(e->hook, e->data); /* TODO: add support for event logging in other io loops */ + ASSERT_DIE(e->list == l); + LOCK_DOMAIN(event, l->lock); + rem_node(&e->n); + UNLOCK_DOMAIN(event, l->lock); + e->hook(e->data); + } } - return !EMPTY_LIST(*l); + LOCK_DOMAIN(event, l->lock); + int repeat = ! EMPTY_LIST(l->events); + UNLOCK_DOMAIN(event, l->lock); + return repeat; } int ev_run_list_limited(event_list *l, uint limit) { + ASSERT_DIE(LEGACY_EVENT_LIST(l)); + ASSERT_THE_BIRD_LOCKED; + node *n; list tmp_list; + LOCK_DOMAIN(event, l->lock); init_list(&tmp_list); - add_tail_list(&tmp_list, l); - init_list(l); + add_tail_list(&tmp_list, &l->events); + init_list(&l->events); + UNLOCK_DOMAIN(event, l->lock); WALK_LIST_FIRST(n, tmp_list) { @@ -179,21 +204,23 @@ ev_run_list_limited(event_list *l, uint limit) if (!limit) break; - /* This is ugly hack, we want to log just events executed from the main I/O loop */ - if ((l == &global_event_list) || (l == &global_work_list)) - io_log_event(e->hook, e->data); + io_log_event(e->hook, e->data); ev_run(e); limit--; } + LOCK_DOMAIN(event, l->lock); if (!EMPTY_LIST(tmp_list)) { /* Attach new items after the unprocessed old items */ - add_tail_list(&tmp_list, l); - init_list(l); - add_tail_list(l, &tmp_list); + add_tail_list(&tmp_list, &l->events); + init_list(&l->events); + add_tail_list(&l->events, &tmp_list); } - return !EMPTY_LIST(*l); + int repeat = ! EMPTY_LIST(l->events); + UNLOCK_DOMAIN(event, l->lock); + + return repeat; } diff --git a/lib/event.h b/lib/event.h index 5f3b78d8..6c358f84 100644 --- a/lib/event.h +++ b/lib/event.h @@ -10,33 +10,62 @@ #define _BIRD_EVENT_H_ #include "lib/resource.h" +#include "lib/locking.h" + +#include + +DEFINE_DOMAIN(event); typedef struct event { resource r; void (*hook)(void *); void *data; node n; /* Internal link */ + struct event_list *list; /* List where this event is put in */ } event; -typedef list event_list; +typedef struct event_list { + list events; + pool *pool; + struct birdloop *loop; + DOMAIN(event) lock; +} event_list; extern event_list global_event_list; extern event_list global_work_list; event *ev_new(pool *); void ev_run(event *); -#define ev_init_list(el) init_list(el) -void ev_enqueue(event_list *, event *); -void ev_schedule(event *); -void ev_schedule_work(event *); + +static inline void ev_init_list(event_list *el, struct birdloop *loop, const char *name) +{ + init_list(&el->events); + el->loop = loop; + el->lock = DOMAIN_NEW(event, name); +} + +void ev_send(event_list *, event *); +#define ev_send_loop(l, e) ev_send(birdloop_event_list((l)), (e)) + +#define ev_schedule(e) ({ ASSERT_THE_BIRD_LOCKED; if (!ev_active((e))) ev_send(&global_event_list, (e)); }) +#define ev_schedule_work(e) ({ ASSERT_THE_BIRD_LOCKED; if (!ev_active((e))) ev_send(&global_work_list, (e)); }) + void ev_postpone(event *); int ev_run_list(event_list *); int ev_run_list_limited(event_list *, uint); +#define LEGACY_EVENT_LIST(l) (((l) == &global_event_list) || ((l) == &global_work_list)) + +_Bool birdloop_inside(struct birdloop *loop); + static inline int ev_active(event *e) { - return e->n.next != NULL; + if (e->list == NULL) + return 0; + + ASSERT_DIE(birdloop_inside(e->list->loop)); + return enlisted(&e->n); } static inline event* diff --git a/lib/event_test.c b/lib/event_test.c index e1215bba..9dda3e2a 100644 --- a/lib/event_test.c +++ b/lib/event_test.c @@ -48,14 +48,17 @@ init_event_check_points(void) event_check_points[i] = 0; } +void resource_sys_init(void); + static int t_ev_run_list(void) { int i; + resource_sys_init(); resource_init(); olock_init(); - timer_init(); + birdloop_init(); io_init(); rt_init(); if_init(); @@ -82,7 +85,9 @@ main(int argc, char *argv[]) { bt_init(argc, argv); + the_bird_lock(); bt_test_suite(t_ev_run_list, "Schedule and run 3 events in right order."); + the_bird_unlock(); return bt_exit_value(); } diff --git a/lib/flowspec_test.c b/lib/flowspec_test.c index ed4afe51..f7f70982 100644 --- a/lib/flowspec_test.c +++ b/lib/flowspec_test.c @@ -666,10 +666,13 @@ t_formatting6(void) return 1; } +void resource_sys_init(void); + int main(int argc, char *argv[]) { bt_init(argc, argv); + resource_sys_init(); bt_test_suite(t_read_length, "Testing get NLRI length"); bt_test_suite(t_write_length, "Testing set NLRI length"); diff --git a/lib/io-loop.h b/lib/io-loop.h new file mode 100644 index 00000000..25f1b2a3 --- /dev/null +++ b/lib/io-loop.h @@ -0,0 +1,54 @@ +/* + * BIRD -- I/O and event loop + * + * Can be freely distributed and used under the terms of the GNU GPL. + */ + +#ifndef _BIRD_IO_LOOP_H_ +#define _BIRD_IO_LOOP_H_ + +#include "nest/bird.h" +#include "lib/lists.h" +#include "lib/locking.h" +#include "lib/resource.h" +#include "lib/event.h" +#include "lib/socket.h" + +void sk_start(sock *s); +void sk_stop(sock *s); +void sk_reloop(sock *s, struct birdloop *loop); + +extern struct birdloop main_birdloop; + +/* Start a new birdloop owned by given pool and domain */ +struct birdloop *birdloop_new(pool *p, uint order, const char *name); + +/* Stop the loop. At the end, the @stopped callback is called unlocked in tail + * position to finish cleanup. Run birdloop_free() from that callback to free + * the loop itself. */ +void birdloop_stop(struct birdloop *loop, void (*stopped)(void *data), void *data); +void birdloop_stop_self(struct birdloop *loop, void (*stopped)(void *data), void *data); +void birdloop_free(struct birdloop *loop); + +/* Get birdloop's event list */ +event_list *birdloop_event_list(struct birdloop *loop); + +/* Get birdloop's time heap */ +struct timeloop *birdloop_time_loop(struct birdloop *loop); + +/* Enter and exit the birdloop */ +void birdloop_enter(struct birdloop *loop); +void birdloop_leave(struct birdloop *loop); + +_Bool birdloop_inside(struct birdloop *loop); + +void birdloop_mask_wakeups(struct birdloop *loop); +void birdloop_unmask_wakeups(struct birdloop *loop); + +void birdloop_link(struct birdloop *loop); +void birdloop_unlink(struct birdloop *loop); + +void birdloop_ping(struct birdloop *loop); + +void birdloop_init(void); +#endif /* _BIRD_IO_LOOP_H_ */ diff --git a/lib/lists.h b/lib/lists.h index 64b4a981..dc49ec8a 100644 --- a/lib/lists.h +++ b/lib/lists.h @@ -68,6 +68,18 @@ typedef union list { /* In fact two overlayed nodes */ #define EMPTY_LIST(list) (!(list).head->next) +static inline _Bool +enlisted(node *n) +{ + switch ((!!n->next) + (!!n->prev)) + { + case 0: return 0; + case 2: return 1; + case 1: bug("Garbled event list node"); + } + + bug("Maths is broken. And you should see a new heaven and a new earth: for the first heaven and the first earth had been passed away."); +} #ifndef _BIRD_LISTS_C_ #define LIST_INLINE static inline diff --git a/lib/locking.h b/lib/locking.h index eef60154..ab5c06af 100644 --- a/lib/locking.h +++ b/lib/locking.h @@ -14,6 +14,9 @@ struct domain_generic; /* Here define the global lock order; first to last. */ struct lock_order { struct domain_generic *the_bird; + struct domain_generic *proto; + struct domain_generic *rtable; + struct domain_generic *event; }; extern _Thread_local struct lock_order locking_stack; @@ -21,24 +24,40 @@ extern _Thread_local struct domain_generic **last_locked; #define DOMAIN(type) struct domain__##type #define DEFINE_DOMAIN(type) DOMAIN(type) { struct domain_generic *type; } +#define DOMAIN_ORDER(type) OFFSETOF(struct lock_order, type) -#define DOMAIN_NEW(type, name) (DOMAIN(type)) { .type = domain_new(name, OFFSETOF(struct lock_order, type)) } +#define DOMAIN_NEW(type, name) (DOMAIN(type)) { .type = domain_new(name, DOMAIN_ORDER(type)) } struct domain_generic *domain_new(const char *name, uint order); +#define DOMAIN_FREE(type, d) domain_free((d).type) +void domain_free(struct domain_generic *); + #define DOMAIN_NULL(type) (DOMAIN(type)) {} #define LOCK_DOMAIN(type, d) do_lock(((d).type), &(locking_stack.type)) #define UNLOCK_DOMAIN(type, d) do_unlock(((d).type), &(locking_stack.type)) +#define DOMAIN_IS_LOCKED(type, d) (((d).type) == (locking_stack.type)) +#define DG_IS_LOCKED(d) ((d) == *(DG_LSP(d))) + /* Internal for locking */ void do_lock(struct domain_generic *dg, struct domain_generic **lsp); void do_unlock(struct domain_generic *dg, struct domain_generic **lsp); +uint dg_order(struct domain_generic *dg); + +#define DG_LSP(d) ((struct domain_generic **) (((void *) &locking_stack) + dg_order(d))) +#define DG_LOCK(d) do_lock(d, DG_LSP(d)) +#define DG_UNLOCK(d) do_unlock(d, DG_LSP(d)) + /* Use with care. To be removed in near future. */ DEFINE_DOMAIN(the_bird); extern DOMAIN(the_bird) the_bird_domain; #define the_bird_lock() LOCK_DOMAIN(the_bird, the_bird_domain) #define the_bird_unlock() UNLOCK_DOMAIN(the_bird, the_bird_domain) +#define the_bird_locked() DOMAIN_IS_LOCKED(the_bird, the_bird_domain) + +#define ASSERT_THE_BIRD_LOCKED ({ if (!the_bird_locked()) bug("The BIRD lock must be locked here: %s:%d", __FILE__, __LINE__); }) #endif diff --git a/lib/socket.h b/lib/socket.h index 96fedeeb..5bdab7f3 100644 --- a/lib/socket.h +++ b/lib/socket.h @@ -12,6 +12,7 @@ #include #include "lib/resource.h" +#include "lib/event.h" #ifdef HAVE_LIBSSH #define LIBSSH_LEGACY_0_4 #include @@ -79,6 +80,7 @@ typedef struct birdsock { const char *password; /* Password for MD5 authentication */ const char *err; /* Error message */ struct ssh_sock *ssh; /* Used in SK_SSH */ + struct event reloop; /* Reloop event */ } sock; sock *sock_new(pool *); /* Allocate new socket */ @@ -128,6 +130,7 @@ extern int sk_priority_control; /* Suggested priority for control traffic, shou #define SKF_TRUNCATED 0x200 /* Received packet was truncated, set by IO layer */ #define SKF_HDRINCL 0x400 /* Used internally */ #define SKF_PKTINFO 0x800 /* Used internally */ +#define SKF_PASSIVE_THREAD 0x1000 /* Child sockets used in thread, do not add to main loop */ /* * Socket types SA SP DA DP IF TTL SendTo (?=may, -=must not, *=must) diff --git a/lib/timer.c b/lib/timer.c index ff1fb5ef..eb7ea690 100644 --- a/lib/timer.c +++ b/lib/timer.c @@ -37,15 +37,8 @@ #include "lib/resource.h" #include "lib/timer.h" - -struct timeloop main_timeloop; - - #include -/* Data accessed and modified from proto/bfd/io.c */ -_Thread_local struct timeloop *local_timeloop; - _Atomic btime last_time; _Atomic btime real_time; @@ -76,7 +69,7 @@ tm_dump(resource *r) if (t->recurrent) debug("recur %d, ", t->recurrent); if (t->expires) - debug("expires in %d ms)\n", (t->expires - current_time()) TO_MS); + debug("in loop %p expires in %d ms)\n", t->loop, (t->expires - current_time()) TO_MS); else debug("inactive)\n"); } @@ -99,8 +92,8 @@ tm_new(pool *p) return t; } -void -tm_set(timer *t, btime when) +static void +tm_set_in_tl(timer *t, btime when, struct timeloop *local_timeloop) { uint tc = timers_count(local_timeloop); @@ -122,17 +115,17 @@ tm_set(timer *t, btime when) HEAP_DECREASE(local_timeloop->timers.data, tc, timer *, TIMER_LESS, TIMER_SWAP, t->index); } -#ifdef CONFIG_BFD - /* Hack to notify BFD loops */ - if ((local_timeloop != &main_timeloop) && (t->index == 1)) - wakeup_kick_current(); -#endif + t->loop = local_timeloop; + + if ((t->index == 1) && (local_timeloop->coro != this_coro)) + birdloop_ping(local_timeloop->loop); } void -tm_start(timer *t, btime after) +tm_set_in(timer *t, btime when, struct birdloop *loop) { - tm_set(t, current_time() + MAX(after, 0)); + ASSERT_DIE(birdloop_inside(loop)); + tm_set_in_tl(t, when, birdloop_time_loop(loop)); } void @@ -141,18 +134,23 @@ tm_stop(timer *t) if (!t->expires) return; - uint tc = timers_count(local_timeloop); + TLOCK_TIMER_ASSERT(t->loop); - HEAP_DELETE(local_timeloop->timers.data, tc, timer *, TIMER_LESS, TIMER_SWAP, t->index); - BUFFER_POP(local_timeloop->timers); + uint tc = timers_count(t->loop); + + HEAP_DELETE(t->loop->timers.data, tc, timer *, TIMER_LESS, TIMER_SWAP, t->index); + BUFFER_POP(t->loop->timers); t->index = -1; t->expires = 0; + t->loop = NULL; } void timers_init(struct timeloop *loop, pool *p) { + TLOCK_TIMER_ASSERT(loop); + BUFFER_INIT(loop->timers, p, 4); BUFFER_PUSH(loop->timers) = NULL; } @@ -160,8 +158,10 @@ timers_init(struct timeloop *loop, pool *p) void io_log_event(void *hook, void *data); void -timers_fire(struct timeloop *loop) +timers_fire(struct timeloop *loop, int io_log) { + TLOCK_TIMER_ASSERT(loop); + btime base_time; timer *t; @@ -183,26 +183,19 @@ timers_fire(struct timeloop *loop) if (t->randomize) when += random() % (t->randomize + 1); - tm_set(t, when); + tm_set_in_tl(t, when, loop); } else tm_stop(t); /* This is ugly hack, we want to log just timers executed from the main I/O loop */ - if (loop == &main_timeloop) + if (io_log) io_log_event(t->hook, t->data); t->hook(t); } } -void -timer_init(void) -{ - timers_init(&main_timeloop, &root_pool); - local_timeloop = &main_timeloop; -} - /** * tm_parse_time - parse a date and time diff --git a/lib/timer.h b/lib/timer.h index b201b8c8..04544ace 100644 --- a/lib/timer.h +++ b/lib/timer.h @@ -12,6 +12,8 @@ #include "nest/bird.h" #include "lib/buffer.h" +#include "lib/io-loop.h" +#include "lib/locking.h" #include "lib/resource.h" #include @@ -29,22 +31,27 @@ typedef struct timer uint randomize; /* Amount of randomization */ uint recurrent; /* Timer recurrence */ + struct timeloop *loop; /* Loop where the timer is active */ + int index; } timer; struct timeloop { BUFFER_(timer *) timers; + struct domain_generic *domain; + struct birdloop *loop; + struct coroutine *coro; }; +#define TLOCK_TIMER_ASSERT(loop) ASSERT_DIE((loop)->domain && DG_IS_LOCKED((loop)->domain)) +#define TLOCK_LOCAL_ASSERT(loop) ASSERT_DIE(!(loop)->domain || DG_IS_LOCKED((loop)->domain)) + static inline uint timers_count(struct timeloop *loop) -{ return loop->timers.used - 1; } +{ TLOCK_TIMER_ASSERT(loop); return loop->timers.used - 1; } static inline timer *timers_first(struct timeloop *loop) -{ return (loop->timers.used > 1) ? loop->timers.data[1] : NULL; } - -extern struct timeloop main_timeloop; -extern _Thread_local struct timeloop *local_timeloop; +{ TLOCK_TIMER_ASSERT(loop); return (loop->timers.used > 1) ? loop->timers.data[1] : NULL; } #define current_time() atomic_load_explicit(&last_time, memory_order_acquire) #define current_real_time() atomic_load_explicit(&real_time, memory_order_acquire) @@ -54,10 +61,13 @@ extern _Thread_local struct timeloop *local_timeloop; extern btime boot_time; timer *tm_new(pool *p); -void tm_set(timer *t, btime when); -void tm_start(timer *t, btime after); +#define tm_set(t, when) tm_set_in((t), (when), &main_birdloop) +#define tm_start(t, after) tm_start_in((t), (after), &main_birdloop) void tm_stop(timer *t); +void tm_set_in(timer *t, btime when, struct birdloop *loop); +#define tm_start_in(t, after, loop) tm_set_in((t), (current_time() + MAX_((after), 0)), loop) + static inline int tm_active(timer *t) { @@ -101,9 +111,7 @@ void times_update(void); /* For I/O loop */ void timers_init(struct timeloop *loop, pool *p); -void timers_fire(struct timeloop *loop); - -void timer_init(void); +void timers_fire(struct timeloop *loop, int io_log); struct timeformat { diff --git a/nest/a-path_test.c b/nest/a-path_test.c index 9ed0a786..2e6e4956 100644 --- a/nest/a-path_test.c +++ b/nest/a-path_test.c @@ -204,10 +204,13 @@ t_as_path_converting(void) } #endif +void resource_sys_init(void); + int main(int argc, char *argv[]) { bt_init(argc, argv); + resource_sys_init(); bt_test_suite(t_as_path_match, "Testing AS path matching and some a-path utilities."); bt_test_suite(t_path_format, "Testing formating as path into byte buffer"); diff --git a/nest/a-set_test.c b/nest/a-set_test.c index 96b6a727..efd1b67d 100644 --- a/nest/a-set_test.c +++ b/nest/a-set_test.c @@ -240,10 +240,14 @@ t_set_ec_delete(void) return 1; } + +void resource_sys_init(void); + int main(int argc, char *argv[]) { bt_init(argc, argv); + resource_sys_init(); bt_test_suite(t_set_int_contains, "Testing sets of integers: contains, get_data"); bt_test_suite(t_set_int_format, "Testing sets of integers: format"); diff --git a/nest/config.Y b/nest/config.Y index 92f1aad2..6e7689ed 100644 --- a/nest/config.Y +++ b/nest/config.Y @@ -434,6 +434,7 @@ proto: dev_proto '}' ; dev_proto_start: proto_start DIRECT { this_proto = proto_config_new(&proto_device, $1); init_list(&DIRECT_CFG->iface_list); + this_proto->late_if_feed = 1; } ; diff --git a/nest/proto.c b/nest/proto.c index fae0647a..ac0fb232 100644 --- a/nest/proto.c +++ b/nest/proto.c @@ -15,6 +15,7 @@ #include "lib/event.h" #include "lib/timer.h" #include "lib/string.h" +#include "lib/coro.h" #include "conf/conf.h" #include "nest/route.h" #include "nest/iface.h" @@ -58,7 +59,28 @@ static void channel_feed_end(struct channel *c); static void channel_export_stopped(struct rt_export_request *req); static inline int proto_is_done(struct proto *p) -{ return (p->proto_state == PS_DOWN) && (p->active_channels == 0); } +{ return (p->proto_state == PS_DOWN) && proto_is_inactive(p); } + +static inline event_list *proto_event_list(struct proto *p) +{ return p->loop == &main_birdloop ? &global_event_list : birdloop_event_list(p->loop); } + +static inline event_list *proto_work_list(struct proto *p) +{ return p->loop == &main_birdloop ? &global_work_list : birdloop_event_list(p->loop); } + +static inline void proto_send_event(struct proto *p) +{ ev_send(proto_event_list(p), p->event); } + +#define PROTO_ENTER_FROM_MAIN(p) ({ \ + ASSERT_DIE(birdloop_inside(&main_birdloop)); \ + struct birdloop *_loop = (p)->loop; \ + if (_loop != &main_birdloop) birdloop_enter(_loop); \ + _loop; \ + }) + +#define PROTO_LEAVE_FROM_MAIN(loop) ({ if (loop != &main_birdloop) birdloop_leave(loop); }) + +#define PROTO_LOCKED_FROM_MAIN(p) for (struct birdloop *_proto_loop = PROTO_ENTER_FROM_MAIN(p); _proto_loop; PROTO_LEAVE_FROM_MAIN(_proto_loop), (_proto_loop = NULL)) + static inline int channel_is_active(struct channel *c) { return (c->channel_state != CS_DOWN); } @@ -473,6 +495,7 @@ channel_start_export(struct channel *c) c->out_req = (struct rt_export_request) { .name = rn, + .list = proto_work_list(c->proto), .trace_routes = c->debug | c->proto->debug, .dump_req = channel_dump_export_req, .log_state_change = channel_export_log_state_change, @@ -517,7 +540,7 @@ channel_check_stopped(struct channel *c) return; channel_set_state(c, CS_DOWN); - ev_schedule(c->proto->event); + proto_send_event(c->proto); break; case CS_PAUSE: @@ -853,6 +876,7 @@ channel_setup_in_table(struct channel *c, int best) }; c->in_table->get = (struct rt_export_request) { .name = cat->name, + .list = proto_work_list(c->proto), .trace_routes = c->debug | c->proto->debug, .dump_req = channel_in_get_dump_req, .log_state_change = channel_get_log_state_change, @@ -895,6 +919,7 @@ channel_setup_out_table(struct channel *c) }; c->out_table->get = (struct rt_export_request) { .name = cat->name, + .list = proto_work_list(c->proto), .trace_routes = c->debug | c->proto->debug, .dump_req = channel_out_get_dump_req, .log_state_change = channel_get_log_state_change, @@ -997,7 +1022,7 @@ channel_do_down(struct channel *c) /* Schedule protocol shutddown */ if (proto_is_done(c->proto)) - ev_schedule(c->proto->event); + proto_send_event(c->proto); } void @@ -1085,9 +1110,12 @@ channel_request_table_feeding(struct channel *c) void channel_request_feeding(struct channel *c) { + if (c->gr_wait || !c->proto->rt_notify) + return; + CD(c, "Refeed requested"); - ASSERT(c->out_req.hook); + ASSERT_DIE(c->out_req.hook); if (c->out_table) channel_aux_request_refeed(c->out_table); @@ -1331,18 +1359,36 @@ proto_configure_channel(struct proto *p, struct channel **pc, struct channel_con return 1; } +static void +proto_cleanup(struct proto *p) +{ + rfree(p->pool); + p->pool = NULL; + + p->active = 0; + proto_log_state_change(p); + proto_rethink_goal(p); +} + +static void +proto_loop_stopped(void *ptr) +{ + struct proto *p = ptr; + + birdloop_enter(&main_birdloop); + + p->loop = &main_birdloop; + p->event->list = NULL; + proto_cleanup(p); + + birdloop_leave(&main_birdloop); +} static void proto_event(void *ptr) { struct proto *p = ptr; - if (p->do_start) - { - if_feed_baby(p); - p->do_start = 0; - } - if (p->do_stop) { if (p->proto == &proto_unix_iface) @@ -1351,14 +1397,10 @@ proto_event(void *ptr) } if (proto_is_done(p)) - { - rfree(p->pool); - p->pool = NULL; - - p->active = 0; - proto_log_state_change(p); - proto_rethink_goal(p); - } + if (p->loop != &main_birdloop) + birdloop_stop_self(p->loop, proto_loop_stopped, p); + else + proto_cleanup(p); } @@ -1399,6 +1441,7 @@ proto_init(struct proto_config *c, node *n) struct protocol *pr = c->protocol; struct proto *p = pr->init(c); + p->loop = &main_birdloop; p->proto_state = PS_DOWN; p->last_state_change = current_time(); p->vrf = c->vrf; @@ -1415,11 +1458,27 @@ proto_init(struct proto_config *c, node *n) static void proto_start(struct proto *p) { - /* Here we cannot use p->cf->name since it won't survive reconfiguration */ - p->pool = rp_new(proto_pool, p->proto->name); + DBG("Kicking %s up\n", p->name); + PD(p, "Starting"); + + int ns = strlen("Protocol ") + strlen(p->cf->name) + 1; + void *nb = mb_alloc(proto_pool, ns); + ASSERT_DIE(ns - 1 == bsnprintf(nb, ns, "Protocol %s", p->cf->name)); + + p->pool = rp_new(proto_pool, nb); if (graceful_restart_state == GRS_INIT) p->gr_recovery = 1; + + if (p->cf->loop_order != DOMAIN_ORDER(the_bird)) + p->loop = birdloop_new(p->pool, p->cf->loop_order, nb); + + p->event->list = proto_event_list(p); + + mb_move(nb, p->pool); + + PROTO_LOCKED_FROM_MAIN(p) + proto_notify_state(p, (p->proto->start ? p->proto->start(p) : PS_UP)); } @@ -1455,6 +1514,7 @@ proto_config_new(struct protocol *pr, int class) cf->class = class; cf->debug = new_config->proto_default_debug; cf->mrtdump = new_config->proto_default_mrtdump; + cf->loop_order = DOMAIN_ORDER(the_bird); init_list(&cf->channels); @@ -1743,12 +1803,21 @@ protos_commit(struct config *new, struct config *old, int force_reconfig, int ty proto_rethink_goal(p); } +static void +proto_shutdown(struct proto *p) +{ + if (p->proto_state == PS_START || p->proto_state == PS_UP) + { + /* Going down */ + DBG("Kicking %s down\n", p->name); + PD(p, "Shutting down"); + proto_notify_state(p, (p->proto->shutdown ? p->proto->shutdown(p) : PS_DOWN)); + } +} + static void proto_rethink_goal(struct proto *p) { - struct protocol *q; - byte goal; - if (p->reconfiguring && !p->active) { struct proto_config *nc = p->cf_new; @@ -1768,32 +1837,12 @@ proto_rethink_goal(struct proto *p) /* Determine what state we want to reach */ if (p->disabled || p->reconfiguring) - goal = PS_DOWN; - else - goal = PS_UP; - - q = p->proto; - if (goal == PS_UP) { - if (!p->active) - { - /* Going up */ - DBG("Kicking %s up\n", p->name); - PD(p, "Starting"); - proto_start(p); - proto_notify_state(p, (q->start ? q->start(p) : PS_UP)); - } - } - else - { - if (p->proto_state == PS_START || p->proto_state == PS_UP) - { - /* Going down */ - DBG("Kicking %s down\n", p->name); - PD(p, "Shutting down"); - proto_notify_state(p, (q->shutdown ? q->shutdown(p) : PS_DOWN)); - } + PROTO_LOCKED_FROM_MAIN(p) + proto_shutdown(p); } + else if (!p->active) + proto_start(p); } struct proto * @@ -1998,7 +2047,7 @@ protos_dump_all(void) #define DPF(x) (p->x ? " " #x : "") debug(" protocol %s (%p) state %s with %d active channels flags: %s%s%s%s\n", p->name, p, p_states[p->proto_state], p->active_channels, - DPF(disabled), DPF(active), DPF(do_start), DPF(do_stop), DPF(reconfiguring)); + DPF(disabled), DPF(active), DPF(do_stop), DPF(reconfiguring)); #undef DPF struct channel *c; @@ -2286,8 +2335,8 @@ static inline void proto_do_start(struct proto *p) { p->active = 1; - p->do_start = 1; - ev_schedule(p->event); + if (!p->cf->late_if_feed) + if_feed_baby(p); } static void @@ -2300,6 +2349,9 @@ proto_do_up(struct proto *p) } proto_start_channels(p); + + if (p->cf->late_if_feed) + if_feed_baby(p); } static inline void @@ -2314,9 +2366,6 @@ proto_do_stop(struct proto *p) p->down_sched = 0; p->gr_recovery = 0; - p->do_stop = 1; - ev_schedule(p->event); - if (p->main_source) { rt_unlock_source(p->main_source); @@ -2324,6 +2373,9 @@ proto_do_stop(struct proto *p) } proto_stop_channels(p); + + p->do_stop = 1; + proto_send_event(p); } static void @@ -2334,7 +2386,7 @@ proto_do_down(struct proto *p) /* Shutdown is finished in the protocol event */ if (proto_is_done(p)) - ev_schedule(p->event); + proto_send_event(p); } @@ -2573,7 +2625,7 @@ proto_cmd_disable(struct proto *p, uintptr_t arg, int cnt UNUSED) p->disabled = 1; p->down_code = PDC_CMD_DISABLE; proto_set_message(p, (char *) arg, -1); - proto_rethink_goal(p); + proto_shutdown(p); cli_msg(-9, "%s: disabled", p->name); } @@ -2606,9 +2658,9 @@ proto_cmd_restart(struct proto *p, uintptr_t arg, int cnt UNUSED) p->disabled = 1; p->down_code = PDC_CMD_RESTART; proto_set_message(p, (char *) arg, -1); - proto_rethink_goal(p); + proto_shutdown(p); p->disabled = 0; - proto_rethink_goal(p); + /* After the protocol shuts down, proto_rethink_goal() is run from proto_event. */ cli_msg(-12, "%s: restarted", p->name); } @@ -2683,7 +2735,9 @@ proto_apply_cmd_symbol(const struct symbol *s, void (* cmd)(struct proto *, uint if (s->proto->proto) { - cmd(s->proto->proto, arg, 0); + struct proto *p = s->proto->proto; + PROTO_LOCKED_FROM_MAIN(p) + cmd(p, arg, 0); cli_msg(0, ""); } else @@ -2698,7 +2752,8 @@ proto_apply_cmd_patt(const char *patt, void (* cmd)(struct proto *, uintptr_t, i WALK_LIST(p, proto_list) if (!patt || patmatch(patt, p->name)) - cmd(p, arg, cnt++); + PROTO_LOCKED_FROM_MAIN(p) + cmd(p, arg, cnt++); if (!cnt) cli_msg(8003, "No protocols match"); diff --git a/nest/protocol.h b/nest/protocol.h index f9996b18..2fbd607c 100644 --- a/nest/protocol.h +++ b/nest/protocol.h @@ -120,8 +120,10 @@ struct proto_config { u8 net_type; /* Protocol network type (NET_*), 0 for undefined */ u8 disabled; /* Protocol enabled/disabled by default */ u8 vrf_set; /* Related VRF instance (below) is defined */ + u8 late_if_feed; /* Delay interface feed after channels are up */ u32 debug, mrtdump; /* Debugging bitfields, both use D_* constants */ u32 router_id; /* Protocol specific router ID */ + uint loop_order; /* Launch a birdloop on this locking level; use DOMAIN_ORDER(the_bird) for mainloop */ list channels; /* List of channel configs (struct channel_config) */ struct iface *vrf; /* Related VRF instance, NULL if global */ @@ -139,6 +141,7 @@ struct proto { struct proto_config *cf_new; /* Configuration we want to switch to after shutdown (NULL=delete) */ pool *pool; /* Pool containing local objects */ event *event; /* Protocol event */ + struct birdloop *loop; /* BIRDloop running this protocol */ list channels; /* List of channels to rtables (struct channel) */ struct channel *main_channel; /* Primary channel */ @@ -149,12 +152,12 @@ struct proto { u32 debug; /* Debugging flags */ u32 mrtdump; /* MRTDump flags */ uint active_channels; /* Number of active channels */ + uint active_coroutines; /* Number of active coroutines */ byte net_type; /* Protocol network type (NET_*), 0 for undefined */ byte disabled; /* Manually disabled */ byte vrf_set; /* Related VRF instance (above) is defined */ byte proto_state; /* Protocol state machine (PS_*, see below) */ byte active; /* From PS_START to cleanup after PS_STOP */ - byte do_start; /* Start actions are scheduled */ byte do_stop; /* Stop actions are scheduled */ byte reconfiguring; /* We're shutting down due to reconfiguration */ byte gr_recovery; /* Protocol should participate in graceful restart recovery */ @@ -356,6 +359,8 @@ void proto_notify_state(struct proto *p, unsigned state); * as a result of received ROUTE-REFRESH request). */ +static inline int proto_is_inactive(struct proto *p) +{ return (p->active_channels == 0) && (p->active_coroutines == 0); } /* diff --git a/nest/route.h b/nest/route.h index d6474e09..7c5fd02f 100644 --- a/nest/route.h +++ b/nest/route.h @@ -11,6 +11,7 @@ #define _BIRD_ROUTE_H_ #include "lib/lists.h" +#include "lib/event.h" #include "lib/bitmap.h" #include "lib/resource.h" #include "lib/net.h" @@ -329,6 +330,8 @@ struct rt_export_request { char *name; u8 trace_routes; + event_list *list; /* Where to schedule export events */ + /* There are two methods of export. You can either request feeding every single change * or feeding the whole route feed. In case of regular export, &export_one is preferred. * Anyway, when feeding, &export_bulk is preferred, falling back to &export_one. diff --git a/nest/rt-table.c b/nest/rt-table.c index c049101a..9c12ef56 100644 --- a/nest/rt-table.c +++ b/nest/rt-table.c @@ -1105,6 +1105,12 @@ rt_next_export(struct rt_export_hook *hook, rtable *tab) return tab->first_export; } +static inline void +rt_send_export_event(struct rt_export_hook *hook) +{ + ev_send(hook->req->list, hook->event); +} + static void rt_announce_exports(timer *tm) { @@ -1116,7 +1122,7 @@ rt_announce_exports(timer *tm) if (atomic_load_explicit(&c->export_state, memory_order_acquire) != TES_READY) continue; - ev_schedule_work(c->event); + rt_send_export_event(c); } } @@ -1169,7 +1175,7 @@ rt_export_hook(void *_data) rte_update_unlock(); } - ev_schedule_work(c->event); + rt_send_export_event(c); } @@ -1732,7 +1738,7 @@ rt_request_export(rtable *tab, struct rt_export_request *req) DBG("New export hook %p req %p in table %s uc=%u\n", hook, req, tab->name, tab->use_count); hook->event = ev_new_init(p, rt_feed_channel, hook); - ev_schedule_work(hook->event); + rt_send_export_event(hook); rt_set_export_state(hook, TES_FEEDING); } @@ -1754,7 +1760,7 @@ rt_stop_export(struct rt_export_request *req, void (*stopped)(struct rt_export_r hook->event->hook = rt_export_stopped; hook->stopped = stopped; - ev_schedule(hook->event); + rt_send_export_event(hook); rt_set_export_state(hook, TES_STOP); } @@ -2869,7 +2875,7 @@ rt_feed_channel(void *data) if (max_feed <= 0) { FIB_ITERATE_PUT(fit); - ev_schedule_work(c->event); + rt_send_export_event(c); return; } @@ -2904,7 +2910,7 @@ rt_feed_channel(void *data) FIB_ITERATE_END; c->event->hook = rt_export_hook; - ev_schedule_work(c->event); + rt_send_export_event(c); rt_set_export_state(c, TES_READY); } diff --git a/proto/bfd/bfd.c b/proto/bfd/bfd.c index dac184c5..3964c267 100644 --- a/proto/bfd/bfd.c +++ b/proto/bfd/bfd.c @@ -113,8 +113,16 @@ #define HASH_IP_EQ(a1,n1,a2,n2) ipa_equal(a1, a2) && n1 == n2 #define HASH_IP_FN(a,n) ipa_hash(a) ^ u32_hash(n) -static list bfd_proto_list; -static list bfd_wait_list; +DEFINE_DOMAIN(rtable); +#define BFD_LOCK LOCK_DOMAIN(rtable, bfd_global.lock) +#define BFD_UNLOCK UNLOCK_DOMAIN(rtable, bfd_global.lock) + +static struct { + DOMAIN(rtable) lock; + list wait_list; + list pickup_list; + list proto_list; +} bfd_global; const char *bfd_state_names[] = { "AdminDown", "Down", "Init", "Up" }; @@ -188,7 +196,7 @@ bfd_session_update_tx_interval(struct bfd_session *s) return; /* Set timer relative to last tx_timer event */ - tm_set(s->tx_timer, s->last_tx + tx_int_l); + tm_set_in(s->tx_timer, s->last_tx + tx_int_l, s->ifa->bfd->p.loop); } static void @@ -202,7 +210,7 @@ bfd_session_update_detection_time(struct bfd_session *s, int kick) if (!s->last_rx) return; - tm_set(s->hold_timer, s->last_rx + timeout); + tm_set_in(s->hold_timer, s->last_rx + timeout, s->ifa->bfd->p.loop); } static void @@ -226,7 +234,7 @@ bfd_session_control_tx_timer(struct bfd_session *s, int reset) if (reset || !tm_active(s->tx_timer)) { s->last_tx = 0; - tm_start(s->tx_timer, 0); + tm_start_in(s->tx_timer, 0, s->ifa->bfd->p.loop); } return; @@ -419,7 +427,7 @@ bfd_get_free_id(struct bfd_proto *p) static struct bfd_session * bfd_add_session(struct bfd_proto *p, ip_addr addr, ip_addr local, struct iface *iface, struct bfd_options *opts) { - birdloop_enter(p->loop); + ASSERT_DIE(birdloop_inside(p->p.loop)); struct bfd_iface *ifa = bfd_get_iface(p, local, iface); @@ -454,8 +462,6 @@ bfd_add_session(struct bfd_proto *p, ip_addr addr, ip_addr local, struct iface * TRACE(D_EVENTS, "Session to %I added", s->addr); - birdloop_leave(p->loop); - return s; } @@ -463,38 +469,34 @@ bfd_add_session(struct bfd_proto *p, ip_addr addr, ip_addr local, struct iface * static void bfd_open_session(struct bfd_proto *p, struct bfd_session *s, ip_addr local, struct iface *ifa) { - birdloop_enter(p->loop); + birdloop_enter(p->p.loop); s->opened = 1; bfd_session_control_tx_timer(s); - birdloop_leave(p->loop); + birdloop_leave(p->p.loop); } static void bfd_close_session(struct bfd_proto *p, struct bfd_session *s) { - birdloop_enter(p->loop); + birdloop_enter(p->p.loop); s->opened = 0; bfd_session_update_state(s, BFD_STATE_DOWN, BFD_DIAG_PATH_DOWN); bfd_session_control_tx_timer(s); - birdloop_leave(p->loop); + birdloop_leave(p->p.loop); } */ static void -bfd_remove_session(struct bfd_proto *p, struct bfd_session *s) +bfd_remove_session_locked(struct bfd_proto *p, struct bfd_session *s) { - ip_addr ip = s->addr; - /* Caller should ensure that request list is empty */ - birdloop_enter(p->loop); - /* Remove session from notify list if scheduled for notification */ /* No need for bfd_lock_sessions(), we are already protected by birdloop_enter() */ if (NODE_VALID(&s->n)) @@ -508,11 +510,17 @@ bfd_remove_session(struct bfd_proto *p, struct bfd_session *s) HASH_REMOVE(p->session_hash_id, HASH_ID, s); HASH_REMOVE(p->session_hash_ip, HASH_IP, s); + TRACE(D_EVENTS, "Session to %I removed", s->addr); + sl_free(p->session_slab, s); +} - TRACE(D_EVENTS, "Session to %I removed", ip); - - birdloop_leave(p->loop); +static void +bfd_remove_session(struct bfd_proto *p, struct bfd_session *s) +{ + birdloop_enter(p->p.loop); + bfd_remove_session_locked(p, s); + birdloop_leave(p->p.loop); } static void @@ -521,7 +529,7 @@ bfd_reconfigure_session(struct bfd_proto *p, struct bfd_session *s) if (EMPTY_LIST(s->request_list)) return; - birdloop_enter(p->loop); + birdloop_enter(p->p.loop); struct bfd_request *req = SKIP_BACK(struct bfd_request, n, HEAD(s->request_list)); s->cf = bfd_merge_options(s->ifa->cf, &req->opts); @@ -534,7 +542,7 @@ bfd_reconfigure_session(struct bfd_proto *p, struct bfd_session *s) bfd_session_control_tx_timer(s, 0); - birdloop_leave(p->loop); + birdloop_leave(p->p.loop); TRACE(D_EVENTS, "Session to %I reconfigured", s->addr); } @@ -618,9 +626,9 @@ bfd_reconfigure_iface(struct bfd_proto *p, struct bfd_iface *ifa, struct bfd_con (new->passive != old->passive); /* This should be probably changed to not access ifa->cf from the BFD thread */ - birdloop_enter(p->loop); + birdloop_enter(p->p.loop); ifa->cf = new; - birdloop_leave(p->loop); + birdloop_leave(p->p.loop); } @@ -681,41 +689,68 @@ bfd_add_request(struct bfd_proto *p, struct bfd_request *req) } static void -bfd_submit_request(struct bfd_request *req) +bfd_pickup_requests(void *_data UNUSED) { node *n; + WALK_LIST(n, bfd_global.proto_list) + { + struct bfd_proto *p = SKIP_BACK(struct bfd_proto, bfd_node, n); + birdloop_enter(p->p.loop); + BFD_LOCK; - WALK_LIST(n, bfd_proto_list) - if (bfd_add_request(SKIP_BACK(struct bfd_proto, bfd_node, n), req)) - return; + node *rn, *rnxt; + WALK_LIST_DELSAFE(rn, rnxt, bfd_global.pickup_list) + bfd_add_request(p, SKIP_BACK(struct bfd_request, n, rn)); - rem_node(&req->n); - add_tail(&bfd_wait_list, &req->n); - req->session = NULL; - bfd_request_notify(req, BFD_STATE_ADMIN_DOWN, 0); + BFD_UNLOCK; + birdloop_leave(p->p.loop); + } + + BFD_LOCK; + node *rn, *rnxt; + WALK_LIST_DELSAFE(rn, rnxt, bfd_global.pickup_list) + { + rem_node(rn); + add_tail(&bfd_global.wait_list, rn); + bfd_request_notify(SKIP_BACK(struct bfd_request, n, rn), BFD_STATE_ADMIN_DOWN, 0); + } + BFD_UNLOCK; } +static event bfd_pickup_event = { .hook = bfd_pickup_requests }; + static void bfd_take_requests(struct bfd_proto *p) { node *n, *nn; - - WALK_LIST_DELSAFE(n, nn, bfd_wait_list) + BFD_LOCK; + WALK_LIST_DELSAFE(n, nn, bfd_global.wait_list) bfd_add_request(p, SKIP_BACK(struct bfd_request, n, n)); + BFD_UNLOCK; } static void bfd_drop_requests(struct bfd_proto *p) { node *n; - - HASH_WALK(p->session_hash_id, next_id, s) + BFD_LOCK; + HASH_WALK_DELSAFE(p->session_hash_id, next_id, s) { - /* We assume that p is not in bfd_proto_list */ WALK_LIST_FIRST(n, s->request_list) - bfd_submit_request(SKIP_BACK(struct bfd_request, n, n)); + { + struct bfd_request *req = SKIP_BACK(struct bfd_request, n, n); + rem_node(&req->n); + add_tail(&bfd_global.pickup_list, &req->n); + req->session = NULL; + bfd_request_notify(req, BFD_STATE_ADMIN_DOWN, 0); + } + + ev_send(&global_event_list, &bfd_pickup_event); + + bfd_remove_session_locked(p, s); } HASH_WALK_END; + BFD_UNLOCK; } static struct resclass bfd_request_class; @@ -728,9 +763,6 @@ bfd_request_session(pool *p, ip_addr addr, ip_addr local, { struct bfd_request *req = ralloc(p, &bfd_request_class); - /* Hack: self-link req->n, we will call rem_node() on it */ - req->n.prev = req->n.next = &req->n; - req->addr = addr; req->local = local; req->iface = iface; @@ -739,11 +771,16 @@ bfd_request_session(pool *p, ip_addr addr, ip_addr local, if (opts) req->opts = *opts; - bfd_submit_request(req); - req->hook = hook; req->data = data; + req->session = NULL; + + BFD_LOCK; + add_tail(&bfd_global.pickup_list, &req->n); + ev_send(&global_event_list, &bfd_pickup_event); + BFD_UNLOCK; + return req; } @@ -1001,8 +1038,10 @@ bfd_notify_init(struct bfd_proto *p) void bfd_init_all(void) { - init_list(&bfd_proto_list); - init_list(&bfd_wait_list); + bfd_global.lock = DOMAIN_NEW(rtable, "BFD Global"); + init_list(&bfd_global.wait_list); + init_list(&bfd_global.pickup_list); + init_list(&bfd_global.proto_list); } static struct proto * @@ -1021,10 +1060,10 @@ bfd_start(struct proto *P) struct bfd_proto *p = (struct bfd_proto *) P; struct bfd_config *cf = (struct bfd_config *) (P->cf); - p->loop = birdloop_new(); - p->tpool = rp_new(NULL, "BFD thread root"); pthread_spin_init(&p->lock, PTHREAD_PROCESS_PRIVATE); + p->tpool = rp_new(P->pool, "BFD loop pool"); + p->session_slab = sl_new(P->pool, sizeof(struct bfd_session)); HASH_INIT(p->session_hash_id, P->pool, 8); HASH_INIT(p->session_hash_ip, P->pool, 8); @@ -1034,9 +1073,7 @@ bfd_start(struct proto *P) init_list(&p->notify_list); bfd_notify_init(p); - add_tail(&bfd_proto_list, &p->bfd_node); - - birdloop_enter(p->loop); + add_tail(&bfd_global.proto_list, &p->bfd_node); if (cf->accept_ipv4 && cf->accept_direct) p->rx4_1 = bfd_open_rx_sk(p, 0, SK_IPV4); @@ -1050,42 +1087,33 @@ bfd_start(struct proto *P) if (cf->accept_ipv6 && cf->accept_multihop) p->rx6_m = bfd_open_rx_sk(p, 1, SK_IPV6); - birdloop_leave(p->loop); - bfd_take_requests(p); struct bfd_neighbor *n; WALK_LIST(n, cf->neigh_list) bfd_start_neighbor(p, n); - birdloop_start(p->loop); - return PS_UP; } - static int bfd_shutdown(struct proto *P) { struct bfd_proto *p = (struct bfd_proto *) P; - struct bfd_config *cf = (struct bfd_config *) (P->cf); + struct bfd_config *cf = (struct bfd_config *) (p->p.cf); rem_node(&p->bfd_node); - birdloop_stop(p->loop); - - struct bfd_neighbor *n; - WALK_LIST(n, cf->neigh_list) - bfd_stop_neighbor(p, n); + struct bfd_neighbor *bn; + WALK_LIST(bn, cf->neigh_list) + bfd_stop_neighbor(p, bn); bfd_drop_requests(p); - /* FIXME: This is hack */ - birdloop_enter(p->loop); - rfree(p->tpool); - birdloop_leave(p->loop); - - birdloop_free(p->loop); + if (p->rx4_1) sk_stop(p->rx4_1); + if (p->rx4_m) sk_stop(p->rx4_m); + if (p->rx6_1) sk_stop(p->rx6_1); + if (p->rx6_m) sk_stop(p->rx6_m); return PS_DOWN; } @@ -1105,7 +1133,7 @@ bfd_reconfigure(struct proto *P, struct proto_config *c) (new->accept_multihop != old->accept_multihop)) return 0; - birdloop_mask_wakeups(p->loop); + birdloop_mask_wakeups(p->p.loop); WALK_LIST(ifa, p->iface_list) bfd_reconfigure_iface(p, ifa, new); @@ -1119,7 +1147,7 @@ bfd_reconfigure(struct proto *P, struct proto_config *c) bfd_reconfigure_neighbors(p, new); - birdloop_unmask_wakeups(p->loop); + birdloop_unmask_wakeups(p->p.loop); return 1; } diff --git a/proto/bfd/bfd.h b/proto/bfd/bfd.h index 9d4cbbf8..8430064b 100644 --- a/proto/bfd/bfd.h +++ b/proto/bfd/bfd.h @@ -17,12 +17,12 @@ #include "nest/password.h" #include "conf/conf.h" #include "lib/hash.h" +#include "lib/io-loop.h" #include "lib/resource.h" #include "lib/socket.h" #include "lib/string.h" #include "nest/bfd.h" -#include "sysdep/unix/io-loop.h" #define BFD_CONTROL_PORT 3784 @@ -87,9 +87,11 @@ struct bfd_neighbor struct bfd_proto { struct proto p; - struct birdloop *loop; - pool *tpool; + pthread_spinlock_t lock; + + pool *tpool; + node bfd_node; slab *session_slab; diff --git a/proto/bfd/config.Y b/proto/bfd/config.Y index df1cba42..ed5479fb 100644 --- a/proto/bfd/config.Y +++ b/proto/bfd/config.Y @@ -36,6 +36,7 @@ proto: bfd_proto ; bfd_proto_start: proto_start BFD { this_proto = proto_config_new(&proto_bfd, $1); + this_proto->loop_order = DOMAIN_ORDER(proto); init_list(&BFD_CFG->patt_list); init_list(&BFD_CFG->neigh_list); BFD_CFG->accept_ipv4 = BFD_CFG->accept_ipv6 = 1; diff --git a/proto/bfd/packets.c b/proto/bfd/packets.c index 7618e20f..37d77f37 100644 --- a/proto/bfd/packets.c +++ b/proto/bfd/packets.c @@ -410,7 +410,7 @@ bfd_err_hook(sock *sk, int err) sock * bfd_open_rx_sk(struct bfd_proto *p, int multihop, int af) { - sock *sk = sk_new(p->tpool); + sock *sk = sk_new(p->p.pool); sk->type = SK_UDP; sk->subtype = af; sk->sport = !multihop ? BFD_CONTROL_PORT : BFD_MULTI_CTL_PORT; @@ -441,7 +441,7 @@ bfd_open_rx_sk(struct bfd_proto *p, int multihop, int af) sock * bfd_open_tx_sk(struct bfd_proto *p, ip_addr local, struct iface *ifa) { - sock *sk = sk_new(p->tpool); + sock *sk = sk_new(p->p.pool); sk->type = SK_UDP; sk->saddr = local; sk->dport = ifa ? BFD_CONTROL_PORT : BFD_MULTI_CTL_PORT; diff --git a/proto/bgp/bgp.c b/proto/bgp/bgp.c index 35e9ea59..96df671f 100644 --- a/proto/bgp/bgp.c +++ b/proto/bgp/bgp.c @@ -853,6 +853,7 @@ bgp_graceful_restart_feed(struct bgp_channel *c) { c->stale_feed = (struct rt_export_request) { .name = "BGP-GR", + .list = &global_work_list, .trace_routes = c->c.debug | c->c.proto->debug, .dump_req = bgp_graceful_restart_feed_dump_req, .log_state_change = bgp_graceful_restart_feed_log_state_change, diff --git a/sysdep/unix/coroutine.c b/sysdep/unix/coroutine.c index 2eba142c..4758c056 100644 --- a/sysdep/unix/coroutine.c +++ b/sysdep/unix/coroutine.c @@ -21,10 +21,9 @@ #include "lib/resource.h" #include "lib/timer.h" -/* Using a rather big stack for coroutines to allow for stack-local allocations. - * In real world, the kernel doesn't alloc this memory until it is used. - * */ -#define CORO_STACK_SIZE 1048576 +#include "conf/conf.h" + +#define CORO_STACK_SIZE 65536 /* * Implementation of coroutines based on POSIX threads @@ -79,6 +78,11 @@ domain_free(struct domain_generic *dg) xfree(dg); } +uint dg_order(struct domain_generic *dg) +{ + return dg->order; +} + void do_lock(struct domain_generic *dg, struct domain_generic **lsp) { if ((char *) lsp - (char *) &locking_stack != dg->order) @@ -89,7 +93,11 @@ void do_lock(struct domain_generic *dg, struct domain_generic **lsp) if (*lsp) bug("Inconsistent locking stack state on lock"); + btime lock_begin = current_time(); pthread_mutex_lock(&dg->mutex); + btime duration = current_time() - lock_begin; + if (config && (duration > config->watchdog_warning)) + log(L_WARN "Locking of %s took %d ms", dg->name, (int) (duration TO_MS)); if (dg->prev || dg->locked_by) bug("Previous unlock not finished correctly"); @@ -140,11 +148,16 @@ static struct resclass coro_class = { .free = coro_free, }; +_Thread_local struct coroutine *this_coro = NULL; + static void *coro_entry(void *p) { struct coroutine *c = p; + ASSERT_DIE(c->entry); + this_coro = c; + c->entry(c->data); ASSERT_DIE(coro_cleaned_up); diff --git a/sysdep/unix/io-loop.c b/sysdep/unix/io-loop.c index a15d866a..275d38a0 100644 --- a/sysdep/unix/io-loop.c +++ b/sysdep/unix/io-loop.c @@ -15,50 +15,47 @@ #include #include "nest/bird.h" -#include "sysdep/unix/io-loop.h" #include "lib/buffer.h" +#include "lib/coro.h" #include "lib/lists.h" #include "lib/resource.h" #include "lib/event.h" #include "lib/timer.h" #include "lib/socket.h" - -struct birdloop -{ - pool *pool; - pthread_t thread; - pthread_mutex_t mutex; - - u8 stop_called; - u8 poll_active; - u8 wakeup_masked; - int wakeup_fds[2]; - - struct timeloop time; - list event_list; - list sock_list; - uint sock_num; - - BUFFER(sock *) poll_sk; - BUFFER(struct pollfd) poll_fd; - u8 poll_changed; - u8 close_scheduled; -}; - +#include "lib/io-loop.h" +#include "sysdep/unix/io-loop.h" +#include "conf/conf.h" /* * Current thread context */ -static _Thread_local struct birdloop *birdloop_current; +_Thread_local struct birdloop *birdloop_current; +static _Thread_local struct birdloop *birdloop_wakeup_masked; +static _Thread_local uint birdloop_wakeup_masked_count; -static inline void -birdloop_set_current(struct birdloop *loop) +event_list * +birdloop_event_list(struct birdloop *loop) { - birdloop_current = loop; - local_timeloop = loop ? &loop->time : &main_timeloop; + return &loop->event_list; +} + +struct timeloop * +birdloop_time_loop(struct birdloop *loop) +{ + return &loop->time; +} + +_Bool +birdloop_inside(struct birdloop *loop) +{ + for (struct birdloop *c = birdloop_current; c; c = c->prev_loop) + if (loop == c) + return 1; + + return 0; } /* @@ -135,57 +132,17 @@ wakeup_do_kick(struct birdloop *loop) pipe_kick(loop->wakeup_fds[1]); } -static inline void -wakeup_kick(struct birdloop *loop) +void +birdloop_ping(struct birdloop *loop) { - if (!loop->wakeup_masked) - wakeup_do_kick(loop); + u32 ping_sent = atomic_fetch_add_explicit(&loop->ping_sent, 1, memory_order_acq_rel); + if (ping_sent) + return; + + if (loop == birdloop_wakeup_masked) + birdloop_wakeup_masked_count++; else - loop->wakeup_masked = 2; -} - -/* For notifications from outside */ -void -wakeup_kick_current(void) -{ - if (birdloop_current && birdloop_current->poll_active) - wakeup_kick(birdloop_current); -} - - -/* - * Events - */ - -static inline uint -events_waiting(struct birdloop *loop) -{ - return !EMPTY_LIST(loop->event_list); -} - -static inline void -events_init(struct birdloop *loop) -{ - init_list(&loop->event_list); -} - -static void -events_fire(struct birdloop *loop) -{ - times_update(); - ev_run_list(&loop->event_list); -} - -void -ev2_schedule(event *e) -{ - if (birdloop_current->poll_active && EMPTY_LIST(birdloop_current->event_list)) - wakeup_kick(birdloop_current); - - if (e->n.next) - rem_node(&e->n); - - add_tail(&birdloop_current->event_list, &e->n); + wakeup_do_kick(loop); } @@ -213,13 +170,13 @@ sockets_add(struct birdloop *loop, sock *s) s->index = -1; loop->poll_changed = 1; - if (loop->poll_active) - wakeup_kick(loop); + birdloop_ping(loop); } void sk_start(sock *s) { + ASSERT_DIE(birdloop_current != &main_birdloop); sockets_add(birdloop_current, s); } @@ -230,28 +187,21 @@ sockets_remove(struct birdloop *loop, sock *s) loop->sock_num--; if (s->index >= 0) + { loop->poll_sk.data[s->index] = NULL; - - s->index = -1; - loop->poll_changed = 1; - - /* Wakeup moved to sk_stop() */ + s->index = -1; + loop->poll_changed = 1; + loop->close_scheduled = 1; + birdloop_ping(loop); + } + else + close(s->fd); } void sk_stop(sock *s) { sockets_remove(birdloop_current, s); - - if (birdloop_current->poll_active) - { - birdloop_current->close_scheduled = 1; - wakeup_kick(birdloop_current); - } - else - close(s->fd); - - s->fd = -1; } static inline uint sk_want_events(sock *s) @@ -351,12 +301,15 @@ sockets_fire(struct birdloop *loop) if (pfd->revents & POLLIN) while (e && *psk && (*psk)->rx_hook) - e = sk_read(*psk, 0); + e = sk_read(*psk, pfd->revents); e = 1; if (pfd->revents & POLLOUT) + { + loop->poll_changed = 1; while (e && *psk) e = sk_write(*psk); + } } } @@ -365,104 +318,168 @@ sockets_fire(struct birdloop *loop) * Birdloop */ -static void *birdloop_main(void *arg); +struct birdloop main_birdloop; + +static void birdloop_enter_locked(struct birdloop *loop); + +void +birdloop_init(void) +{ + wakeup_init(&main_birdloop); + + main_birdloop.time.domain = the_bird_domain.the_bird; + main_birdloop.time.loop = &main_birdloop; + + times_update(); + timers_init(&main_birdloop.time, &root_pool); + + birdloop_enter_locked(&main_birdloop); +} + +static void birdloop_main(void *arg); struct birdloop * -birdloop_new(void) +birdloop_new(pool *pp, uint order, const char *name) { - pool *p = rp_new(NULL, "Birdloop root"); + struct domain_generic *dg = domain_new(name, order); + + pool *p = rp_new(pp, name); struct birdloop *loop = mb_allocz(p, sizeof(struct birdloop)); loop->pool = p; - pthread_mutex_init(&loop->mutex, NULL); + + loop->time.domain = dg; + loop->time.loop = loop; + + birdloop_enter(loop); wakeup_init(loop); - - events_init(loop); + ev_init_list(&loop->event_list, loop, name); timers_init(&loop->time, p); sockets_init(loop); + loop->time.coro = coro_run(p, birdloop_main, loop); + + birdloop_leave(loop); + return loop; } -void -birdloop_start(struct birdloop *loop) +static void +birdloop_do_stop(struct birdloop *loop, void (*stopped)(void *data), void *data) { - int rv = pthread_create(&loop->thread, NULL, birdloop_main, loop); - if (rv) - die("pthread_create(): %M", rv); + loop->stopped = stopped; + loop->stop_data = data; + wakeup_do_kick(loop); } void -birdloop_stop(struct birdloop *loop) +birdloop_stop(struct birdloop *loop, void (*stopped)(void *data), void *data) { - pthread_mutex_lock(&loop->mutex); - loop->stop_called = 1; - wakeup_do_kick(loop); - pthread_mutex_unlock(&loop->mutex); + DG_LOCK(loop->time.domain); + birdloop_do_stop(loop, stopped, data); + DG_UNLOCK(loop->time.domain); +} - int rv = pthread_join(loop->thread, NULL); - if (rv) - die("pthread_join(): %M", rv); +void +birdloop_stop_self(struct birdloop *loop, void (*stopped)(void *data), void *data) +{ + ASSERT_DIE(loop == birdloop_current); + ASSERT_DIE(DG_IS_LOCKED(loop->time.domain)); + + birdloop_do_stop(loop, stopped, data); } void birdloop_free(struct birdloop *loop) { + ASSERT_DIE(loop->links == 0); + domain_free(loop->time.domain); rfree(loop->pool); } +static void +birdloop_enter_locked(struct birdloop *loop) +{ + ASSERT_DIE(DG_IS_LOCKED(loop->time.domain)); + ASSERT_DIE(!birdloop_inside(loop)); + + /* Store the old context */ + loop->prev_loop = birdloop_current; + + /* Put the new context */ + birdloop_current = loop; +} void birdloop_enter(struct birdloop *loop) { - /* TODO: these functions could save and restore old context */ - pthread_mutex_lock(&loop->mutex); - birdloop_set_current(loop); + DG_LOCK(loop->time.domain); + return birdloop_enter_locked(loop); +} + +static void +birdloop_leave_locked(struct birdloop *loop) +{ + /* Check the current context */ + ASSERT_DIE(birdloop_current == loop); + + /* Restore the old context */ + birdloop_current = loop->prev_loop; } void birdloop_leave(struct birdloop *loop) { - /* TODO: these functions could save and restore old context */ - birdloop_set_current(NULL); - pthread_mutex_unlock(&loop->mutex); + birdloop_leave_locked(loop); + DG_UNLOCK(loop->time.domain); } void birdloop_mask_wakeups(struct birdloop *loop) { - pthread_mutex_lock(&loop->mutex); - loop->wakeup_masked = 1; - pthread_mutex_unlock(&loop->mutex); + ASSERT_DIE(birdloop_wakeup_masked == NULL); + birdloop_wakeup_masked = loop; } void birdloop_unmask_wakeups(struct birdloop *loop) { - pthread_mutex_lock(&loop->mutex); - if (loop->wakeup_masked == 2) + ASSERT_DIE(birdloop_wakeup_masked == loop); + birdloop_wakeup_masked = NULL; + if (birdloop_wakeup_masked_count) wakeup_do_kick(loop); - loop->wakeup_masked = 0; - pthread_mutex_unlock(&loop->mutex); + + birdloop_wakeup_masked_count = 0; } -static void * +void +birdloop_link(struct birdloop *loop) +{ + ASSERT_DIE(birdloop_inside(loop)); + loop->links++; +} + +void +birdloop_unlink(struct birdloop *loop) +{ + ASSERT_DIE(birdloop_inside(loop)); + loop->links--; +} + +static void birdloop_main(void *arg) { struct birdloop *loop = arg; timer *t; int rv, timeout; - birdloop_set_current(loop); + btime loop_begin = current_time(); - pthread_mutex_lock(&loop->mutex); + birdloop_enter(loop); while (1) { - events_fire(loop); - timers_fire(&loop->time); - - times_update(); - if (events_waiting(loop)) + timers_fire(&loop->time, 0); + if (ev_run_list(&loop->event_list)) timeout = 0; else if (t = timers_first(&loop->time)) timeout = (tm_remains(t) TO_MS) + 1; @@ -472,8 +489,11 @@ birdloop_main(void *arg) if (loop->poll_changed) sockets_prepare(loop); - loop->poll_active = 1; - pthread_mutex_unlock(&loop->mutex); + btime duration = current_time() - loop_begin; + if (duration > config->watchdog_warning) + log(L_WARN "I/O loop cycle took %d ms", (int) (duration TO_MS)); + + birdloop_leave(loop); try: rv = poll(loop->poll_fd.data, loop->poll_fd.used, timeout); @@ -484,25 +504,32 @@ birdloop_main(void *arg) die("poll: %m"); } - pthread_mutex_lock(&loop->mutex); - loop->poll_active = 0; + birdloop_enter(loop); if (loop->close_scheduled) sockets_close_fds(loop); - if (loop->stop_called) + if (loop->stopped) break; + loop_begin = current_time(); + if (rv) sockets_fire(loop); - timers_fire(&loop->time); + atomic_exchange_explicit(&loop->ping_sent, 0, memory_order_acq_rel); } - loop->stop_called = 0; - pthread_mutex_unlock(&loop->mutex); + /* Flush remaining events */ + ASSERT_DIE(!ev_run_list(&loop->event_list)); - return NULL; + /* No timers allowed */ + ASSERT_DIE(timers_count(&loop->time) == 0); + ASSERT_DIE(EMPTY_LIST(loop->sock_list)); + ASSERT_DIE(loop->sock_num == 0); + + birdloop_leave(loop); + loop->stopped(loop->stop_data); } diff --git a/sysdep/unix/io-loop.h b/sysdep/unix/io-loop.h index d858b04e..4024b6c5 100644 --- a/sysdep/unix/io-loop.h +++ b/sysdep/unix/io-loop.h @@ -4,31 +4,32 @@ * Can be freely distributed and used under the terms of the GNU GPL. */ -#ifndef _BIRD_IO_LOOP_H_ -#define _BIRD_IO_LOOP_H_ +#ifndef _BIRD_SYSDEP_UNIX_IO_LOOP_H_ +#define _BIRD_SYSDEP_UNIX_IO_LOOP_H_ -#include "nest/bird.h" -#include "lib/lists.h" -#include "lib/resource.h" -#include "lib/event.h" -#include "lib/timer.h" -#include "lib/socket.h" +struct birdloop +{ + pool *pool; + struct timeloop time; + event_list event_list; + list sock_list; + uint sock_num; -void ev2_schedule(event *e); + BUFFER(sock *) poll_sk; + BUFFER(struct pollfd) poll_fd; + u8 poll_changed; + u8 close_scheduled; -void sk_start(sock *s); -void sk_stop(sock *s); + _Atomic u32 ping_sent; + int wakeup_fds[2]; -struct birdloop *birdloop_new(void); -void birdloop_start(struct birdloop *loop); -void birdloop_stop(struct birdloop *loop); -void birdloop_free(struct birdloop *loop); + uint links; -void birdloop_enter(struct birdloop *loop); -void birdloop_leave(struct birdloop *loop); -void birdloop_mask_wakeups(struct birdloop *loop); -void birdloop_unmask_wakeups(struct birdloop *loop); + void (*stopped)(void *data); + void *stop_data; + struct birdloop *prev_loop; +}; -#endif /* _BIRD_IO_LOOP_H_ */ +#endif diff --git a/sysdep/unix/io.c b/sysdep/unix/io.c index 90bb5d64..c91f2856 100644 --- a/sysdep/unix/io.c +++ b/sysdep/unix/io.c @@ -43,6 +43,7 @@ #include "conf/conf.h" #include "sysdep/unix/unix.h" +#include "sysdep/unix/io-loop.h" #include CONFIG_INCLUDE_SYSIO_H /* Maximum number of calls of tx handler for one socket in one @@ -800,18 +801,16 @@ sk_free(resource *r) sk_ssh_free(s); #endif - if (s->fd < 0) + if ((s->fd < 0) || (s->flags & SKF_THREAD)) return; - /* FIXME: we should call sk_stop() for SKF_THREAD sockets */ - if (!(s->flags & SKF_THREAD)) - { - if (s == current_sock) - current_sock = sk_next(s); - if (s == stored_sock) - stored_sock = sk_next(s); + if (s == current_sock) + current_sock = sk_next(s); + if (s == stored_sock) + stored_sock = sk_next(s); + + if (enlisted(&s->n)) rem_node(&s->n); - } if (s->type != SK_SSH && s->type != SK_SSH_ACTIVE) close(s->fd); @@ -1104,7 +1103,11 @@ sk_passive_connected(sock *s, int type) return 1; } - sk_insert(t); + if (s->flags & SKF_PASSIVE_THREAD) + t->flags |= SKF_THREAD; + else + sk_insert(t); + sk_alloc_bufs(t); s->rx_hook(t, 0); return 1; @@ -1508,6 +1511,36 @@ sk_open_unix(sock *s, char *name) return 0; } +static void +sk_reloop_hook(void *_vs) +{ + sock *s = _vs; + if (birdloop_inside(&main_birdloop)) + { + s->flags &= ~SKF_THREAD; + sk_insert(s); + } + else + { + s->flags |= SKF_THREAD; + sk_start(s); + } +} + +void +sk_reloop(sock *s, struct birdloop *loop) +{ + if (enlisted(&s->n)) + rem_node(&s->n); + + s->reloop = (event) { + .hook = sk_reloop_hook, + .data = s, + }; + + ev_send_loop(loop, &s->reloop); +} + #define CMSG_RX_SPACE MAX(CMSG4_SPACE_PKTINFO+CMSG4_SPACE_TTL, \ CMSG6_SPACE_PKTINFO+CMSG6_SPACE_TTL) @@ -2143,8 +2176,9 @@ void io_init(void) { init_list(&sock_list); - init_list(&global_event_list); - init_list(&global_work_list); + ev_init_list(&global_event_list, &main_birdloop, "Global event list"); + ev_init_list(&global_work_list, &main_birdloop, "Global work list"); + ev_init_list(&main_birdloop.event_list, &main_birdloop, "Global fast event list"); krt_io_init(); // XXX init_times(); // XXX update_times(); @@ -2158,14 +2192,7 @@ static int short_loops = 0; #define SHORT_LOOP_MAX 10 #define WORK_EVENTS_MAX 10 -static int poll_reload_pipe[2]; - -void -io_loop_reload(void) -{ - char b; - write(poll_reload_pipe[1], &b, 1); -} +void pipe_drain(int fd); void io_loop(void) @@ -2178,21 +2205,19 @@ io_loop(void) int fdmax = 256; struct pollfd *pfd = xmalloc(fdmax * sizeof(struct pollfd)); - if (pipe(poll_reload_pipe) < 0) - die("pipe(poll_reload_pipe) failed: %m"); - watchdog_start1(); for(;;) { times_update(); events = ev_run_list(&global_event_list); events = ev_run_list_limited(&global_work_list, WORK_EVENTS_MAX) || events; - timers_fire(&main_timeloop); + events = ev_run_list(&main_birdloop.event_list) || events; + timers_fire(&main_birdloop.time, 1); io_close_event(); // FIXME poll_tout = (events ? 0 : 3000); /* Time in milliseconds */ - if (t = timers_first(&main_timeloop)) + if (t = timers_first(&main_birdloop.time)) { times_update(); timeout = (tm_remains(t) TO_MS) + 1; @@ -2200,7 +2225,7 @@ io_loop(void) } /* A hack to reload main io_loop() when something has changed asynchronously. */ - pfd[0].fd = poll_reload_pipe[0]; + pfd[0].fd = main_birdloop.wakeup_fds[0]; pfd[0].events = POLLIN; nfds = 1; @@ -2263,9 +2288,9 @@ io_loop(void) /* And finally enter poll() to find active sockets */ watchdog_stop(); - the_bird_unlock(); + birdloop_leave(&main_birdloop); pout = poll(pfd, nfds, poll_tout); - the_bird_lock(); + birdloop_enter(&main_birdloop); watchdog_start(); if (pout < 0) @@ -2279,8 +2304,8 @@ io_loop(void) if (pfd[0].revents & POLLIN) { /* IO loop reload requested */ - char b; - read(poll_reload_pipe[0], &b, 1); + pipe_drain(main_birdloop.wakeup_fds[0]); + atomic_exchange_explicit(&main_birdloop.ping_sent, 0, memory_order_acq_rel); continue; } diff --git a/sysdep/unix/main.c b/sysdep/unix/main.c index d35424ff..5da27cb6 100644 --- a/sysdep/unix/main.c +++ b/sysdep/unix/main.c @@ -908,10 +908,12 @@ main(int argc, char **argv) parse_args(argc, argv); log_switch(1, NULL, NULL); + the_bird_lock(); + random_init(); net_init(); resource_init(); - timer_init(); + birdloop_init(); olock_init(); io_init(); rt_init(); @@ -961,7 +963,6 @@ main(int argc, char **argv) dup2(0, 2); } - the_bird_lock(); main_thread_init(); diff --git a/sysdep/unix/unix.h b/sysdep/unix/unix.h index 313c97c3..ad85d1ea 100644 --- a/sysdep/unix/unix.h +++ b/sysdep/unix/unix.h @@ -106,7 +106,6 @@ extern volatile sig_atomic_t async_shutdown_flag; void io_init(void); void io_loop(void); -void io_loop_reload(void); void io_log_dump(void); int sk_open_unix(struct birdsock *s, char *name); struct rfile *rf_open(struct pool *, const char *name, const char *mode); diff --git a/test/bt-utils.c b/test/bt-utils.c index cbca3a6b..90815e77 100644 --- a/test/bt-utils.c +++ b/test/bt-utils.c @@ -53,16 +53,20 @@ cf_file_read(byte *dest, uint max_len, int fd) return l; } +void resource_sys_init(void); + void bt_bird_init(void) { + resource_sys_init(); if(bt_verbose) log_init_debug(""); log_switch(bt_verbose != 0, NULL, NULL); + the_bird_lock(); resource_init(); olock_init(); - timer_init(); + birdloop_init(); io_init(); rt_init(); if_init(); @@ -79,6 +83,7 @@ void bt_bird_cleanup(void) class_to_protocol[i] = NULL; config = new_config = NULL; + the_bird_unlock(); } static char * From 6e841b3153565632b6753f6b1fe74850c37f2808 Mon Sep 17 00:00:00 2001 From: Maria Matejka Date: Wed, 20 Oct 2021 23:08:58 +0200 Subject: [PATCH 18/59] Adding a generic cork mechanism for events --- lib/event.c | 76 +++++++++++++++++++++++++++++++++++++++---- lib/event.h | 30 +++++++++++++++++ lib/locking.h | 1 + lib/socket.h | 1 + sysdep/unix/io-loop.c | 2 +- sysdep/unix/io.c | 2 +- 6 files changed, 103 insertions(+), 9 deletions(-) diff --git a/lib/event.c b/lib/event.c index 6c5c8b14..5031f314 100644 --- a/lib/event.c +++ b/lib/event.c @@ -114,17 +114,42 @@ ev_send(event_list *l, event *e) e->list = l; - LOCK_DOMAIN(event, l->lock); - if (enlisted(&e->n)) + struct event_cork *ec = e->cork; + + uint ping = 0; + + if (ec) { + LOCK_DOMAIN(cork, ec->lock); + LOCK_DOMAIN(event, l->lock); + + if (!enlisted(&e->n)) + if (ec->count) + add_tail(&ec->events, &e->n); + else + { + add_tail(&l->events, &e->n); + ping = 1; + } + + UNLOCK_DOMAIN(event, l->lock); + UNLOCK_DOMAIN(cork, ec->lock); + } + else + { + LOCK_DOMAIN(event, l->lock); + + if (!enlisted(&e->n)) + { + add_tail(&l->events, &e->n); + ping = 1; + } + UNLOCK_DOMAIN(event, l->lock); - return; } - add_tail(&l->events, &e->n); - UNLOCK_DOMAIN(event, l->lock); - - birdloop_ping(l->loop); + if (ping) + birdloop_ping(l->loop); } void io_log_event(void *hook, void *data); @@ -224,3 +249,40 @@ ev_run_list_limited(event_list *l, uint limit) return repeat; } + +void ev_cork(struct event_cork *ec) +{ + LOCK_DOMAIN(cork, ec->lock); + ec->count++; + UNLOCK_DOMAIN(cork, ec->lock); +} + +void ev_uncork(struct event_cork *ec) +{ + LOCK_DOMAIN(cork, ec->lock); + + if (--ec->count) + { + UNLOCK_DOMAIN(cork, ec->lock); + return; + } + + node *n; + WALK_LIST_FIRST(n, ec->events) + { + event *e = SKIP_BACK(event, n, n); + event_list *el = e->list; + + rem_node(&e->n); + + LOCK_DOMAIN(event, el->lock); + add_tail(&el->events, &e->n); + UNLOCK_DOMAIN(event, el->lock); + + birdloop_ping(el->loop); + } + + UNLOCK_DOMAIN(cork, ec->lock); + + birdloop_ping(&main_birdloop); +} diff --git a/lib/event.h b/lib/event.h index 6c358f84..cd85bf78 100644 --- a/lib/event.h +++ b/lib/event.h @@ -15,6 +15,7 @@ #include DEFINE_DOMAIN(event); +DEFINE_DOMAIN(cork); typedef struct event { resource r; @@ -22,6 +23,8 @@ typedef struct event { void *data; node n; /* Internal link */ struct event_list *list; /* List where this event is put in */ + struct event_cork *cork; /* Event execution limiter */ + node cork_node; } event; typedef struct event_list { @@ -31,6 +34,12 @@ typedef struct event_list { DOMAIN(event) lock; } event_list; +struct event_cork { + DOMAIN(cork) lock; + u32 count; + list events; +}; + extern event_list global_event_list; extern event_list global_work_list; @@ -44,6 +53,13 @@ static inline void ev_init_list(event_list *el, struct birdloop *loop, const cha el->lock = DOMAIN_NEW(event, name); } +static inline void ev_init_cork(struct event_cork *ec, const char *name) +{ + init_list(&ec->events); + ec->lock = DOMAIN_NEW(cork, name); + ec->count = 0; +}; + void ev_send(event_list *, event *); #define ev_send_loop(l, e) ev_send(birdloop_event_list((l)), (e)) @@ -56,6 +72,20 @@ int ev_run_list_limited(event_list *, uint); #define LEGACY_EVENT_LIST(l) (((l) == &global_event_list) || ((l) == &global_work_list)) +void ev_cork(struct event_cork *); +void ev_uncork(struct event_cork *); + +static inline u32 ev_corked(struct event_cork *ec) +{ + if (!ec) + return 0; + + LOCK_DOMAIN(cork, ec->lock); + u32 out = ec->count; + UNLOCK_DOMAIN(cork, ec->lock); + return out; +} + _Bool birdloop_inside(struct birdloop *loop); static inline int diff --git a/lib/locking.h b/lib/locking.h index ab5c06af..0cbdead8 100644 --- a/lib/locking.h +++ b/lib/locking.h @@ -16,6 +16,7 @@ struct lock_order { struct domain_generic *the_bird; struct domain_generic *proto; struct domain_generic *rtable; + struct domain_generic *cork; struct domain_generic *event; }; diff --git a/lib/socket.h b/lib/socket.h index 5bdab7f3..ff07660f 100644 --- a/lib/socket.h +++ b/lib/socket.h @@ -57,6 +57,7 @@ typedef struct birdsock { uint fast_rx; /* RX has higher priority in event loop */ uint rbsize; int (*rx_hook)(struct birdsock *, uint size); /* NULL=receiving turned off, returns 1 to clear rx buffer */ + struct event_cork *cork; /* Cork to temporarily stop receiving data */ byte *tbuf, *tpos; /* NULL=allocate automatically */ byte *ttx; /* Internal */ diff --git a/sysdep/unix/io-loop.c b/sysdep/unix/io-loop.c index 275d38a0..c7cf4ad2 100644 --- a/sysdep/unix/io-loop.c +++ b/sysdep/unix/io-loop.c @@ -205,7 +205,7 @@ sk_stop(sock *s) } static inline uint sk_want_events(sock *s) -{ return (s->rx_hook ? POLLIN : 0) | ((s->ttx != s->tpos) ? POLLOUT : 0); } +{ return ((s->rx_hook && !ev_corked(s->cork)) ? POLLIN : 0) | ((s->ttx != s->tpos) ? POLLOUT : 0); } /* FIXME: this should be called from sock code diff --git a/sysdep/unix/io.c b/sysdep/unix/io.c index c91f2856..dd385c80 100644 --- a/sysdep/unix/io.c +++ b/sysdep/unix/io.c @@ -2234,7 +2234,7 @@ io_loop(void) { pfd[nfds] = (struct pollfd) { .fd = -1 }; /* everything other set to 0 by this */ s = SKIP_BACK(sock, n, n); - if (s->rx_hook) + if (s->rx_hook && !ev_corked(s->cork)) { pfd[nfds].fd = s->fd; pfd[nfds].events |= POLLIN; From 3b20722a1fc777c27ab2e0451d0ea3fee7fa81a2 Mon Sep 17 00:00:00 2001 From: Maria Matejka Date: Mon, 27 Sep 2021 14:08:03 +0200 Subject: [PATCH 19/59] Table cork: Stop creating updates when there are too many pending. The corked procedure gets a callback when uncorked. Supported by table maintenance routines and also BGP. --- nest/route.h | 6 ++++++ nest/rt-table.c | 23 ++++++++++++++++++++++- proto/bgp/bgp.c | 1 + 3 files changed, 29 insertions(+), 1 deletion(-) diff --git a/nest/route.h b/nest/route.h index 88a56073..310cea92 100644 --- a/nest/route.h +++ b/nest/route.h @@ -157,6 +157,7 @@ struct rtable_config { btime min_settle_time; /* Minimum settle time for notifications */ btime max_settle_time; /* Maximum settle time for notifications */ btime export_settle_time; /* Delay before exports are announced */ + uint cork_limit; /* Amount of routes to be pending on export to cork imports */ }; typedef struct rtable { @@ -187,6 +188,9 @@ typedef struct rtable { int gc_counter; /* Number of operations since last GC */ byte prune_state; /* Table prune state, 1 -> scheduled, 2-> running */ byte nhu_state; /* Next Hop Update state */ + + byte cork_active; /* Congestion control activated */ + struct fib_iterator prune_fit; /* Rtable prune FIB iterator */ struct fib_iterator nhu_fit; /* Next Hop Update FIB iterator */ struct tbf rl_pipe; /* Rate limiting token buffer for pipe collisions */ @@ -378,6 +382,8 @@ struct rt_export_hook { void (*stopped)(struct rt_export_request *); /* Stored callback when export is stopped */ }; +extern struct event_cork rt_cork; + #define TIS_DOWN 0 #define TIS_UP 1 #define TIS_STOP 2 diff --git a/nest/rt-table.c b/nest/rt-table.c index 6dd6587e..b4cd0448 100644 --- a/nest/rt-table.c +++ b/nest/rt-table.c @@ -110,6 +110,7 @@ const char *rt_export_state_name(u8 state) return rt_export_state_name_array[state]; } +struct event_cork rt_cork; /* Like fib_route(), but skips empty net entries */ static inline void * @@ -1061,7 +1062,13 @@ rte_announce(rtable *tab, net *net, struct rte_storage *new, struct rte_storage if (tab->first_export == NULL) tab->first_export = rpe; - if (!tm_active(tab->export_timer)) + if ((tab->first_export->seq + tab->config->cork_limit <= tab->next_export_seq) && !tab->cork_active) + { + ev_cork(&rt_cork); + tab->cork_active = 1; + tm_start(tab->export_timer, 0); + } + else if (!tm_active(tab->export_timer)) tm_start(tab->export_timer, tab->config->export_settle_time); } @@ -2042,6 +2049,7 @@ rt_free(resource *_r) DBG("Deleting routing table %s\n", r->name); ASSERT_DIE(r->use_count == 0); ASSERT_DIE(r->rt_count == 0); + ASSERT_DIE(!r->cork_active); ASSERT_DIE(EMPTY_LIST(r->imports)); ASSERT_DIE(EMPTY_LIST(r->exports)); @@ -2106,6 +2114,9 @@ rt_setup(pool *pp, struct rtable_config *cf) t->hcu_event = ev_new_init(p, rt_update_hostcache, t); t->nhu_event = ev_new_init(p, rt_next_hop_update, t); + t->nhu_event->cork = &rt_cork; + t->prune_event->cork = &rt_cork; + t->export_timer = tm_new_init(p, rt_announce_exports, t, 0, 0); t->last_rt_change = t->gc_time = current_time(); t->next_export_seq = 1; @@ -2128,6 +2139,7 @@ rt_init(void) rt_table_pool = rp_new(&root_pool, "Routing tables"); rte_update_pool = lp_new_default(rt_table_pool); init_list(&routing_tables); + ev_init_cork(&rt_cork, "Route Table Cork"); } /** @@ -2420,6 +2432,14 @@ done:; if (EMPTY_LIST(tab->pending_exports) && tm_active(tab->export_timer)) tm_stop(tab->export_timer); + + /* If reduced to at most one export block pending */ + if (tab->cork_active && + ((!tab->first_export) || (tab->first_export->seq + 128 > tab->next_export_seq))) + { + tab->cork_active = 0; + ev_uncork(&rt_cork); + } } void @@ -2699,6 +2719,7 @@ rt_new_table(struct symbol *s, uint addr_type) c->gc_min_time = 5; c->min_settle_time = 1 S; c->max_settle_time = 20 S; + c->cork_limit = 4 * page_size / sizeof(struct rt_pending_export); c->config = new_config; add_tail(&new_config->tables, &c->n); diff --git a/proto/bgp/bgp.c b/proto/bgp/bgp.c index 96df671f..5f2e7dfd 100644 --- a/proto/bgp/bgp.c +++ b/proto/bgp/bgp.c @@ -584,6 +584,7 @@ bgp_conn_enter_established_state(struct bgp_conn *conn) p->link_addr = p->neigh->iface->llv6->ip; conn->sk->fast_rx = 0; + conn->sk->cork = &rt_cork; p->conn = conn; p->last_error_class = 0; From a8a3d95be5db1a8a7d5a17e2eb8e233417b1d8c7 Mon Sep 17 00:00:00 2001 From: Ondrej Zajicek Date: Sat, 4 Jun 2022 17:34:57 +0200 Subject: [PATCH 20/59] Nest: Improve GC strategy for rtables Use timer (configurable as 'gc period') to schedule routing table GC/pruning to ensure that prune is done on time but not too often. Randomize GC timers to avoid concentration of GC events from different tables in one loop cycle. Fix a bug that caused minimum inter-GC interval be 5 us instead of 5 s. Make default 'gc period' adaptive based on number of routing tables, from 10 s for small setups to 600 s for large ones. In marge multi-table RS setup, the patch improved time of flushing a downed peer from 20-30 min to <2 min and removed 40s latencies. --- conf/conf.c | 1 + doc/bird.sgml | 15 ++++++++++++++ nest/config.Y | 4 +++- nest/route.h | 8 +++++--- nest/rt-table.c | 54 +++++++++++++++++++++++++++++++++++++++++-------- 5 files changed, 70 insertions(+), 12 deletions(-) diff --git a/conf/conf.c b/conf/conf.c index a2b01667..025c040e 100644 --- a/conf/conf.c +++ b/conf/conf.c @@ -140,6 +140,7 @@ config_parse(struct config *c) protos_preconfig(c); rt_preconfig(c); cf_parse(); + rt_postconfig(c); if (EMPTY_LIST(c->protos)) cf_error("No protocol is specified in the config file"); diff --git a/doc/bird.sgml b/doc/bird.sgml index 1580facd..326fc7a8 100644 --- a/doc/bird.sgml +++ b/doc/bird.sgml @@ -684,6 +684,21 @@ to set options. limit to the settle time from the initial ROA table change even if there are consecutive updates gradually renewing the settle time. Default: 20 s. + +