0
0
mirror of https://gitlab.nic.cz/labs/bird.git synced 2024-12-22 01:31:55 +00:00

Protocol restart timer reworked.

The restart timer was racy and didn't allow for immediate restarts
from limits. Now the protocol stores the last restart time, and in case
of too frequent autorestarts caused by exceeded limits, the protocol
gets disabled with an error message.

Also now there is a configuration knob for this.
This commit is contained in:
Maria Matejka 2024-11-25 12:02:13 +01:00
parent 53431ff679
commit d265aefd32
4 changed files with 25 additions and 36 deletions

View File

@ -839,6 +839,13 @@ agreement").
command line interface without needing to touch the configuration.
Disabled protocols are not activated. Default: protocol is enabled.
<tag><label id="proto-restart-limit">restart time <m/time/</tag>
Set the minimum time between subsequent automatic restarts of the protocol.
If the protocol hits a limit (with a restart action) before this time
elapses from starting the protocol, the protocol is disabled with
an error message in the log file. This doesn't apply to manual
restarts or reconfiguration. Default: 5 s.
<tag><label id="proto-debug">debug all|off|{ states|routes|filters|interfaces|events|packets [, <m/.../] }</tag>
Set protocol debugging options. If asked, each protocol is capable of
writing trace messages about its work to the log (with category

View File

@ -340,6 +340,7 @@ proto_name:
proto_item:
/* EMPTY */
| DISABLED bool { this_proto->disabled = $2; }
| RESTART TIME expr_us { this_proto->restart_limit = $3; }
| DEBUG debug_mask { this_proto->debug = $2; }
| MRTDUMP mrtdump_mask { this_proto->mrtdump = $2; }
| ROUTER ID idval { this_proto->router_id = $3; }

View File

@ -1335,6 +1335,9 @@ proto_new(struct proto_config *cf)
p->hash_key = random_u32();
cf->proto = p;
p->last_restart = current_time();
p->restart_limit = cf->restart_limit;
PST_LOCKED(tp)
{
p->id = hmap_first_zero(&tp->proto_id_map);
@ -1454,6 +1457,8 @@ proto_config_new(struct protocol *pr, int class)
cf->mrtdump = new_config->proto_default_mrtdump;
cf->loop_order = DOMAIN_ORDER(the_bird);
cf->restart_limit = 5 S;
init_list(&cf->channels);
return cf;
@ -1561,7 +1566,7 @@ static int
proto_reconfigure(struct proto *p, struct proto_config *oc, struct proto_config *nc, int type)
{
/* If the protocol is DOWN, we just restart it */
if (p->proto_state == PS_DOWN_XX)
if ((p->proto_state == PS_DOWN_XX) || (p->proto_state == PS_FLUSH))
return 0;
/* If there is a too big change in core attributes, ... */
@ -1574,6 +1579,7 @@ proto_reconfigure(struct proto *p, struct proto_config *oc, struct proto_config
p->sources.name = p->name = nc->name;
p->sources.debug = p->debug = nc->debug;
p->mrtdump = nc->mrtdump;
p->restart_limit = nc->restart_limit;
reconfigure_type = type;
/* Execute protocol specific reconfigure hook */
@ -2167,29 +2173,15 @@ proto_restart_event_hook(void *_p)
p->disabled = 1;
proto_rethink_goal(p);
p->restart_event = NULL;
p->restart_timer = NULL;
if (proto_restart)
if (current_time_now() - p->last_restart < p->restart_limit)
log(L_ERR "%s: too frequent restarts, disabling", p->name);
else
p->disabled = 0;
/* No need to call proto_rethink_goal() here again as the proto_cleanup() routine will
* call it after the protocol stops ... and both these routines are fixed to main_birdloop.
*/
p->disabled = 0;
}
static void
proto_send_restart_event(struct proto *p)
{
if (!p->restart_event)
p->restart_event = ev_new_init(p->pool, proto_restart_event_hook, p);
ev_send(&global_event_list, p->restart_event);
}
static void
proto_send_restart_event_from_timer(struct timer *t)
{
proto_send_restart_event((struct proto *) t->data);
}
static inline void
@ -2205,20 +2197,8 @@ proto_schedule_down(struct proto *p, byte restart, byte code)
p->down_sched = restart ? PDS_RESTART : PDS_DISABLE;
p->down_code = code;
if (!restart)
{
if (p->restart_timer && tm_active(p->restart_timer))
tm_stop(p->restart_timer);
proto_send_restart_event(p);
}
else
{
if (!p->restart_timer)
p->restart_timer = tm_new_init(p->pool, proto_send_restart_event_from_timer, p, 0, 0);
tm_start_max_in(p->restart_timer, 250 MS, p->loop);
}
/* Request protocol restart to be initiated from the mainloop */
ev_send(&global_event_list, ev_new_init(p->pool, proto_restart_event_hook, p));
}
/**

View File

@ -113,6 +113,7 @@ struct proto_config {
u32 router_id; /* Protocol specific router ID */
uint loop_order; /* Launch a birdloop on this locking level; use DOMAIN_ORDER(the_bird) for mainloop */
btime loop_max_latency; /* Request this specific maximum latency of loop; zero to default */
btime restart_limit; /* Minimum allowed time between limit restarts */
list channels; /* List of channel configs (struct channel_config) */
struct iface *vrf; /* Related VRF instance, NULL if global */
@ -142,8 +143,6 @@ struct proto {
pool *pool_inloop; /* Pool containing local objects which need to be freed
before the protocol's birdloop actually stops, like olocks */
event *event; /* Protocol event */
timer *restart_timer; /* Timer to restart the protocol from limits */
event *restart_event; /* Event to restart/shutdown the protocol from limits */
struct birdloop *loop; /* BIRDloop running this protocol */
list channels; /* List of channels to rtables (struct channel) */
@ -170,6 +169,8 @@ struct proto {
byte down_code; /* Reason for shutdown (PDC_* codes) */
u32 hash_key; /* Random key used for hashing of neighbors */
btime last_state_change; /* Time of last state transition */
btime last_restart; /* Time of last restart */
btime restart_limit; /* Minimum allowed time between limit restarts */
char *last_state_name_announced; /* Last state name we've announced to the user */
char *message; /* State-change message, allocated from proto_pool */
u32 id; /* Sequential ID used as index in proto_state_table */