From f7639a9fafa7411ebd1f2af56c270b970ac09f3d Mon Sep 17 00:00:00 2001 From: Maria Matejka Date: Mon, 23 Dec 2024 21:06:26 +0100 Subject: [PATCH] Graceful recovery: converted to obstacles Yet another refcounting mechanism had a locking collision. --- nest/proto.c | 178 ++++++++++++++++++++++++++---------------------- nest/protocol.h | 14 +++- 2 files changed, 110 insertions(+), 82 deletions(-) diff --git a/nest/proto.c b/nest/proto.c index 6fa74e9f..caf99829 100644 --- a/nest/proto.c +++ b/nest/proto.c @@ -31,15 +31,8 @@ static list STATIC_LIST_INIT(protocol_list); #define CD(c, msg, args...) ({ if (c->debug & D_STATES) log(L_TRACE "%s.%s: " msg, c->proto->name, c->name ?: "?", ## args); }) #define PD(p, msg, args...) ({ if (p->debug & D_STATES) log(L_TRACE "%s: " msg, p->name, ## args); }) -static timer *gr_wait_timer; - -#define GRS_NONE 0 -#define GRS_INIT 1 -#define GRS_ACTIVE 2 -#define GRS_DONE 3 - -static int graceful_restart_state; -static u32 graceful_restart_locks; +static struct graceful_recovery_context _graceful_recovery_context; +OBSREF(struct graceful_recovery_context) graceful_recovery_context; static char *p_states[] = { "DOWN", "START", "UP", "STOP" }; static char *c_states[] = { "DOWN", "START", "UP", "STOP", "RESTART" }; @@ -912,7 +905,7 @@ channel_do_stop(struct channel *c) ev_postpone(&c->reimport_event); c->gr_wait = 0; - if (c->gr_lock) + if (OBSREF_GET(c->gr_lock)) channel_graceful_restart_unlock(c); CALL(c->class->shutdown, c); @@ -1407,7 +1400,7 @@ proto_start(struct proto *p) DBG("Kicking %s up\n", p->name); PD(p, "Starting"); - if (graceful_restart_state == GRS_INIT) + if (OBSREF_GET(graceful_recovery_context)) p->gr_recovery = 1; if (p->cf->loop_order != DOMAIN_ORDER(the_bird)) @@ -1921,7 +1914,45 @@ proto_enable(struct proto *p) * */ -static void graceful_restart_done(timer *t); +/** + * graceful_restart_done - finalize graceful restart + * @t: unused + * + * When there are no locks on graceful restart, the functions finalizes the + * graceful restart recovery. Protocols postponing route export until the end of + * the recovery are awakened and the export to them is enabled. + */ +static void +graceful_recovery_done(struct callback *_ UNUSED) +{ + ASSERT_DIE(birdloop_inside(&main_birdloop)); + ASSERT_DIE(_graceful_recovery_context.grc_state == GRS_ACTIVE); + + tm_stop(&_graceful_recovery_context.wait_timer); + log(L_INFO "Graceful recovery done"); + + WALK_TLIST(proto, p, &global_proto_list) + PROTO_LOCKED_FROM_MAIN(p) + { + p->gr_recovery = 0; + + struct channel *c; + WALK_LIST(c, p->channels) + { + ASSERT_DIE(!OBSREF_GET(c->gr_lock)); + + /* Resume postponed export of routes */ + if ((c->channel_state == CS_UP) && c->gr_wait && p->rt_notify) + channel_start_export(c); + + /* Cleanup */ + c->gr_wait = 0; + } + } + + _graceful_recovery_context.grc_state = GRS_DONE; +} + /** * graceful_restart_recovery - request initial graceful restart recovery @@ -1933,7 +1964,30 @@ static void graceful_restart_done(timer *t); void graceful_restart_recovery(void) { - graceful_restart_state = GRS_INIT; + obstacle_target_init( + &_graceful_recovery_context.obstacles, + &_graceful_recovery_context.obstacles_cleared, + &root_pool, "Graceful recovery"); + + OBSREF_SET(graceful_recovery_context, &_graceful_recovery_context); + _graceful_recovery_context.grc_state = GRS_INIT; +} + +static void +graceful_recovery_timeout(timer *t UNUSED) +{ + log(L_INFO "Graceful recovery timeout"); + WALK_TLIST(proto, p, &global_proto_list) + PROTO_LOCKED_FROM_MAIN(p) + { + struct channel *c; + WALK_LIST(c, p->channels) + if (OBSREF_GET(c->gr_lock)) + { + log(L_INFO "Graceful recovery: Not waiting for %s.%s", p->name, c->name); + OBSREF_CLEAR(c->gr_lock); + } + } } /** @@ -1946,73 +2000,35 @@ graceful_restart_recovery(void) void graceful_restart_init(void) { - if (!graceful_restart_state) + if (!OBSREF_GET(graceful_recovery_context)) return; - log(L_INFO "Graceful restart started"); + log(L_INFO "Graceful recovery started"); - if (!graceful_restart_locks) - { - graceful_restart_done(NULL); - return; - } + _graceful_recovery_context.grc_state = GRS_ACTIVE; - graceful_restart_state = GRS_ACTIVE; - gr_wait_timer = tm_new_init(proto_pool, graceful_restart_done, NULL, 0, 0); + _graceful_recovery_context.wait_timer = (timer) { .hook = graceful_recovery_timeout }; u32 gr_wait = atomic_load_explicit(&global_runtime, memory_order_relaxed)->gr_wait; - tm_start(gr_wait_timer, gr_wait S); -} + tm_start(&_graceful_recovery_context.wait_timer, gr_wait S); -/** - * graceful_restart_done - finalize graceful restart - * @t: unused - * - * When there are no locks on graceful restart, the functions finalizes the - * graceful restart recovery. Protocols postponing route export until the end of - * the recovery are awakened and the export to them is enabled. All other - * related state is cleared. The function is also called when the graceful - * restart wait timer fires (but there are still some locks). - */ -static void -graceful_restart_done(timer *t) -{ - log(L_INFO "Graceful restart done"); - graceful_restart_state = GRS_DONE; + callback_init(&_graceful_recovery_context.obstacles_cleared, graceful_recovery_done, &main_birdloop); - WALK_TLIST(proto, p, &global_proto_list) - { - if (!p->gr_recovery) - continue; - - struct channel *c; - WALK_LIST(c, p->channels) - { - /* Resume postponed export of routes */ - if ((c->channel_state == CS_UP) && c->gr_wait && p->rt_notify) - channel_start_export(c); - - /* Cleanup */ - c->gr_wait = 0; - c->gr_lock = 0; - } - - p->gr_recovery = 0; - } - - graceful_restart_locks = 0; - - rfree(t); + /* The last clearing of obstacle reference will cause + * the graceful recovery finish immediately. */ + OBSREF_CLEAR(graceful_recovery_context); } void graceful_restart_show_status(void) { - if (graceful_restart_state != GRS_ACTIVE) + if (_graceful_recovery_context.grc_state != GRS_ACTIVE) return; cli_msg(-24, "Graceful restart recovery in progress"); - cli_msg(-24, " Waiting for %d channels to recover", graceful_restart_locks); - cli_msg(-24, " Wait timer is %t/%u", tm_remains(gr_wait_timer), + cli_msg(-24, " Waiting for %u channels to recover", + obstacle_target_count(&_graceful_recovery_context.obstacles)); + cli_msg(-24, " Wait timer is %t/%u", + tm_remains(&_graceful_recovery_context.wait_timer), atomic_load_explicit(&global_runtime, memory_order_relaxed)->gr_wait); } @@ -2032,14 +2048,22 @@ graceful_restart_show_status(void) void channel_graceful_restart_lock(struct channel *c) { - ASSERT(graceful_restart_state == GRS_INIT); - ASSERT(c->proto->gr_recovery); + ASSERT_DIE(birdloop_inside(&main_birdloop)); - if (c->gr_lock) + if (OBSREF_GET(c->gr_lock)) return; - c->gr_lock = 1; - graceful_restart_locks++; + switch (_graceful_recovery_context.grc_state) + { + case GRS_INIT: + case GRS_ACTIVE: + OBSREF_SET(c->gr_lock, &_graceful_recovery_context); + break; + + case GRS_NONE: + case GRS_DONE: + break; + } } /** @@ -2052,18 +2076,10 @@ channel_graceful_restart_lock(struct channel *c) void channel_graceful_restart_unlock(struct channel *c) { - if (!c->gr_lock) - return; - - c->gr_lock = 0; - graceful_restart_locks--; - - if ((graceful_restart_state == GRS_ACTIVE) && !graceful_restart_locks) - tm_start(gr_wait_timer, 0); + OBSREF_CLEAR(c->gr_lock); } - /** * protos_dump_all - dump status of all protocols * @@ -2615,9 +2631,9 @@ channel_show_info(struct channel *c) cli_msg(-1006, " Input filter: %s", filter_name(c->in_filter)); cli_msg(-1006, " Output filter: %s", filter_name(c->out_filter)); - if (graceful_restart_state == GRS_ACTIVE) + if (_graceful_recovery_context.grc_state == GRS_ACTIVE) cli_msg(-1006, " GR recovery: %s%s", - c->gr_lock ? " pending" : "", + OBSREF_GET(c->gr_lock) ? " pending" : "", c->gr_wait ? " waiting" : ""); channel_show_limit(&c->rx_limit, "Receive limit:", c->limit_active & (1 << PLD_RX), c->limit_actions[PLD_RX]); diff --git a/nest/protocol.h b/nest/protocol.h index 2bfa1628..ec561b26 100644 --- a/nest/protocol.h +++ b/nest/protocol.h @@ -659,7 +659,7 @@ struct channel { u8 channel_state; u8 reloadable; /* Hook reload_routes() is allowed on the channel */ - u8 gr_lock; /* Graceful restart mechanism should wait for this channel */ + OBSREF(struct graceful_recovery_context) gr_lock; /* Graceful restart mechanism should wait for this channel */ u8 gr_wait; /* Route export to channel is postponed until graceful restart */ u32 obstacles; /* External obstacles remaining before cleanup */ @@ -763,4 +763,16 @@ void *channel_config_new(const struct channel_class *cc, const char *name, uint void *channel_config_get(const struct channel_class *cc, const char *name, uint net_type, struct proto_config *proto); int channel_reconfigure(struct channel *c, struct channel_config *cf); +struct graceful_recovery_context { + struct obstacle_target obstacles; + struct callback obstacles_cleared; + enum { + GRS_NONE, + GRS_INIT, + GRS_ACTIVE, + GRS_DONE, + } grc_state; + timer wait_timer; +}; + #endif